# Cross-validation of a RoBERTa-based model using the Tree-structured Parzen Estimator (TPE) algorithm

Import the required libraries

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel
import matplotlib.pyplot as plt
from helpers import *
from implementations import *
import pickle
import os 
import json
from bson import json_util
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import KFold

To speed up training and inference, we used the TPUs available on Google Colab. If you want to run this code on Google Colab, uncomment the cell below.

In [4]:
# Optional Google Colab TPU setup. The whole cell is commented out on purpose:
# uncomment it only when running on a Colab TPU runtime. It defines `tpu_strategy`,
# which is referenced by the (also commented) `with tpu_strategy.scope():` line
# in `hyperopt_cross_val`.
# NOTE: `tf.distribute.experimental.TPUStrategy` is deprecated in recent TensorFlow 2
# releases; `tf.distribute.TPUStrategy` is the replacement.

# %tensorflow_version 2.x
# import tensorflow as tf
# print("Tensorflow version " + tf.__version__)

# try:
#   tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
#   print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
# except ValueError:
#   raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

# tf.config.experimental_connect_to_cluster(tpu)
# tf.tpu.experimental.initialize_tpu_system(tpu)
# tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


Tensorflow version 2.7.0
Running on TPU  ['10.4.54.218:8470']
INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.


INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.


INFO:tensorflow:Initializing the TPU system: grpc://10.4.54.218:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.4.54.218:8470


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


Load the data (we use the smaller dataset due to running time) and set required constants

In [5]:
# Small dataset (200'000 observations)
POS_TRAINING_PATH = "train_pos.txt"
NEG_TRAINING_PATH = "train_neg.txt"

# Full dataset (2.2 million observations)
POS_TRAINING_PATH_FULL = "train_pos_full.txt"
NEG_TRAINING_PATH_FULL = "train_neg_full.txt"

# Test data
TEST_PATH = "test_data.txt"

# Constants shared by the cells below
RANDOM_STATE = 123  # seed used for the weight initializer and dropout layers in build_model
MAX_LENGTH= 55  # sequence length of the model's input layers (see build_model)
USE_LSTM = False # True: build_model adds two Bi-LSTM layers; False: only the first-token embedding feeds the MLP

# Load the small training set only; swap in the commented line below to use the full dataset
df_full = loadData(POS_TRAINING_PATH, NEG_TRAINING_PATH)
# df_full = loadData(POS_TRAINING_PATH_FULL, NEG_TRAINING_PATH_FULL)

Load the test data

In [6]:
# Read the test file: every line has the form "<index>,<tweet>".
# Splitting on the first comma only keeps any commas inside the tweet intact.
with open(TEST_PATH) as test:
    rows = [line.split(",", 1) for line in test]

idx = [int(row[0]) for row in rows]
test_sent = [row[1] for row in rows]

# Assemble the parsed columns into a DataFrame and preview it
data = {'index': idx, 'tweet': test_sent}
df_test = pd.DataFrame(data)
df_test.head()

Unnamed: 0,index,tweet
0,1,sea doo pro sea scooter ( sports with the port...
1,2,<user> shucks well i work all week so now i ca...
2,3,i cant stay away from bug thats my baby\n
3,4,<user> no ma'am ! ! ! lol im perfectly fine an...
4,5,"whenever i fall asleep watching the tv , i alw..."


Load and initialize the base RoBERTa tokenizer

In [7]:
# Load the pretrained tokenizer of the 'roberta-base' checkpoint
# (the same checkpoint name is used for the model in hyperopt_cross_val)
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')

In [8]:
# Shuffle the entire small dataset (frac=1 keeps every row).
# The seed comes from the shared RANDOM_STATE constant instead of a duplicated literal.
training = df_full.sample(frac=1, random_state=RANDOM_STATE)

In [10]:
# Preview the first rows of the shuffled training set
training.head()

Unnamed: 0,tweet,sentiment
28285,"<user> awww , whizzy is happy\n",1
5080,#waystomakemehappy back rubs please\n,1
93291,this is torture ; i'm sat upstairs revising an...,0
3185,8am she's lucky she is cute for waking me so e...,1
61505,anyone who takes the mick out of anyone on #un...,1


Obtain the DataFrames of token indices and attention masks for each tweet.

In [10]:
# Encode every training tweet with the RoBERTa tokenizer.
# `batch_encode_cross_validation` is a project helper brought in by the star imports above.
# NOTE(review): both results are indexed with `.iloc` in hyperopt_cross_val, so they are
# presumably DataFrames with one row per tweet — confirm against the helper.
train_idx, train_att = batch_encode_cross_validation(tokenizer_roberta, training.tweet.to_list())

Define the function that builds the model

In [12]:
def build_model(transformer, dropout_rate, layer1_nodes, layer2_nodes, act_func, l1, l2, max_length=MAX_LENGTH, random_state=RANDOM_STATE):
    """
    Build the model given the set of hyperparameters

    The Transformer's last hidden state feeds an MLP classifier that ends in a
    single sigmoid unit. When the module-level flag USE_LSTM is true, two
    bidirectional LSTM layers run over the token embeddings and their output is
    concatenated with the first-token embedding before the MLP.

    Input:
        - transformer:   the Transformer model to be used; it is called as
                         transformer([input_ids, input_attention]) and element 0
                         of its output is taken as the last hidden state
        - dropout_rate:  dropout rate applied before/after every classifier layer
        - layer1_nodes:  number of neurons in the first layer of the MLP classifier
        - layer2_nodes:  number of neurons in the second layer of the MLP classifier
                         (0 disables the second layer)
        - act_func:      activation function of the MLP layers
        - l1:            l1 regularization weight
        - l2:            l2 regularization weight
        - max_length:    max sentence length (default=MAX_LENGTH)
        - random_state:  seed for reproducibility (initializer and dropout)
    Output:
        - model:         the built model ready to be compiled and trained
    """

    # Define weight initializer with a random seed to ensure reproducibility
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=random_state)

    def regularizer():
        # Fresh l1/l2 kernel regularizer for each layer that needs one
        return tf.keras.regularizers.l1_l2(l1=l1, l2=l2)

    def dropout(tensor):
        # Seeded dropout; the rate is positional and the seed is a separate keyword
        return tf.keras.layers.Dropout(dropout_rate, seed=random_state)(tensor)

    def bi_lstm(tensor, return_sequences):
        # Bidirectional LSTM whose forward/backward outputs are averaged
        lstm = tf.keras.layers.LSTM(units=1024,
                                    kernel_regularizer=regularizer(),
                                    kernel_initializer=weight_initializer,
                                    bias_initializer='zeros',
                                    return_sequences=return_sequences)
        return tf.keras.layers.Bidirectional(lstm, merge_mode='ave')(tensor)

    def dense(tensor, units):
        # One regularized hidden layer of the MLP classifier
        return tf.keras.layers.Dense(units,
                                     activation=act_func,
                                     kernel_regularizer=regularizer(),
                                     kernel_initializer=weight_initializer,
                                     bias_initializer='zeros')(tensor)

    # Define input layers
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,),
                                            name='input_ids',
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,),
                                                  name='input_attention',
                                                  dtype='int32')

    # Access the last layer of the Transformer model.
    # It is a tf.Tensor of shape (batch_size, max_length, hidden_size=768).
    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]

    # From the last layer, we select the embedding of the first ([CLS]) token
    cls_token = last_hidden_state[:, 0, :]

    if USE_LSTM:
        # Include 2 bidirectional LSTM layers over the per-token embeddings;
        # the first and last sequence positions are dropped.
        embeddings = last_hidden_state[:, 1:-1, :]
        X = dropout(embeddings)
        X = bi_lstm(X, return_sequences=True)
        X = dropout(X)
        X = bi_lstm(X, return_sequences=False)
        X = tf.concat([cls_token, X], axis=1)
    else:
        X = cls_token

    # Build the 2-layers MLP classifier
    X = dropout(X)
    X = dense(X, layer1_nodes)
    X = dropout(X)
    if layer2_nodes != 0:
        X = dense(X, layer2_nodes)
        X = dropout(X)

    # Define a single node that makes up the output layer (for binary classification)
    output = tf.keras.layers.Dense(1,
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,
                                   kernel_constraint=None,
                                   bias_initializer='zeros'
                                   )(X)

    # Define the model
    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)

    return model

Define the function that compiles and trains the model given the hyperparameters

In [13]:
def train_model(model, training_ids, training_attention, val_ids, val_attention, training_sentiment, val_sentiment, batch_size, lr, beta1, beta2, train_epochs=2, fine_tune_epochs=3):
    """
    Compile and train the given model

    Training happens in two phases: the full model is trained for
    `train_epochs`, then the Transformer layer is frozen and only the
    classifier on top is trained for `fine_tune_epochs`.

    Input:
        - model:              the model built to be trained
        - training_ids:       training set of indices
        - training_attention: training set of attention masks
        - val_ids:            validation set of indices
        - val_attention:      validation set of attention masks
        - training_sentiment: training labels
        - val_sentiment:      validation labels
        - batch_size:         batch size to be used for training
        - lr:                 learning rate to be used for training
        - beta1:              beta_1 parameter of Adam optimizer
        - beta2:              beta_2 parameter of Adam optimizer
        - train_epochs:       number of epochs to train the full model (default=2)
        - fine_tune_epochs:   number of epochs to fine tune the MLP classifier (default=3)
    Output:
        - val_acc:            best validation accuracy reached during the fine-tuning phase
        - best_epoch:         fine-tuning epoch (1-based) that gave the best validation accuracy
    """

    # Number of full batches per epoch (a trailing partial batch is not used)
    num_steps = tf.shape(training_ids)[0].numpy() // batch_size

    def compile_model():
        # `learning_rate` is the supported argument name; `lr` is a deprecated alias
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr, beta_1=beta1, beta_2=beta2),
                      loss=tf.keras.losses.BinaryCrossentropy(),
                      metrics=['accuracy'])

    def fit(epochs):
        # Both phases use the same data and batching; only the epoch count differs
        return model.fit(
            x=[training_ids, training_attention],
            y=training_sentiment,
            epochs=epochs,
            batch_size=batch_size,
            steps_per_epoch=num_steps,
            validation_data=([val_ids, val_attention], val_sentiment),
            verbose=2
        )

    # Phase 1: compile with Adam and train the full model
    compile_model()
    model.summary()  # summary() prints by itself; wrapping it in print() adds a stray "None"
    fit(train_epochs)

    # Phase 2: freeze the Transformer. In the models produced by build_model the
    # layer at index 2 is the Transformer (it follows the two input layers).
    model.get_layer(index=2).trainable = False
    # Keras only picks up a changed `trainable` flag when the model is compiled
    # again; without this the Transformer would keep being updated.
    compile_model()
    model.summary()

    # Fine tune the Neural Network built on top of the Transformer
    fine_tune_history = fit(fine_tune_epochs)
    val_accuracies = fine_tune_history.history['val_accuracy']

    # Find the best validation accuracy
    val_acc = max(val_accuracies)
    print(val_acc)

    # Find the fine-tuning epoch that achieved the highest accuracy (plain int, 1-based)
    best_epoch = int(np.argmax(val_accuracies)) + 1
    print(best_epoch)

    return val_acc, best_epoch


Define the function that splits the data into K folds (we use 3 because of the running time), then trains and validates a model with the current hyperparameters on each fold. It returns the average validation accuracy over the folds together with the best fine-tuning epoch of each fold.

In [11]:
def hyperopt_cross_val(hyperparams):
    """
    Split the data into folds and run the training and validation on all the folds

    Reads the module-level `train_idx`, `train_att` and `training` objects.

    Input:
        - hyperparams:       set of hyperparameters to be used, as the tuple
                             (learning_rate, drop_rate, beta1, beta2, l1, l2,
                              batchsize, activation, layer1_nodes, layer2_nodes)
    Output:
        - average accuracy:  average best validation accuracy over the folds
        - best_epochs:       list of all the best epochs for each fold
    """

    # uncomment the line below to use the TPU from Google Colab
    #with tpu_strategy.scope(): 

    # The hyperparameters are the same for every fold, so unpack them once
    (learning_rate, drop_rate, beta1, beta2, l1, l2, batchsize, activation, layer1_nodes, layer2_nodes) = hyperparams

    kf = KFold(n_splits=3)
    accuracy = []
    best_epochs = []
    for train_rows, val_rows in kf.split(train_idx):
        # Every fold builds a brand-new model; clear Keras' global state first so
        # memory held by the previous fold's model can be released.
        tf.keras.backend.clear_session()

        # Initialize RoBERTa layer with all of its sub-layers trainable
        RoBERTa = TFRobertaModel.from_pretrained('roberta-base')
        for layer in RoBERTa.layers:
            layer.trainable = True

        # Create datasets (positional row selection for inputs and labels alike)
        training_ids = tf.convert_to_tensor(train_idx.iloc[train_rows])
        training_attention = tf.convert_to_tensor(train_att.iloc[train_rows])
        val_ids = tf.convert_to_tensor(train_idx.iloc[val_rows])
        val_attention = tf.convert_to_tensor(train_att.iloc[val_rows])
        training_sentiment = training.sentiment.iloc[train_rows]
        val_sentiment = training.sentiment.iloc[val_rows]

        print("Training with the following hyperparameters:" )
        print(hyperparams)

        # Build model
        model = build_model(RoBERTa, drop_rate, layer1_nodes, layer2_nodes, activation, l1, l2, max_length=MAX_LENGTH, random_state=RANDOM_STATE)

        # Compile and train the model, keeping this fold's best result
        acc, epoch = train_model(model, training_ids, training_attention, val_ids, val_attention, training_sentiment, val_sentiment, batchsize, learning_rate, beta1, beta2)
        accuracy.append(acc)
        best_epochs.append(epoch)

    return np.mean(accuracy), best_epochs

Define the space of hyperparameters to be searched and run the Tree-structured Parzen Estimator (TPE) algorithm for hyperparameter optimization.

In [None]:
# Bookkeeping shared with the objective function `f`
eval_num = 0
best_acc = None
best_hyperparams = None

# Space to be searched.
# The order of the entries matters: hyperopt passes the sampled values as a
# positional tuple, which hyperopt_cross_val unpacks as
# (learning_rate, drop_rate, beta1, beta2, l1, l2, batchsize, activation,
#  layer1_nodes, layer2_nodes).
# 'l1' therefore has to come before 'l2'; with the opposite order the value
# recorded under the label 'l2' would actually be applied as the l1 weight.
# Discard any results.pkl produced with a different ordering before resuming.
parameter_space = [
                    hp.choice('learning_rate',[5e-5, 5e-4, 5e-3, 5e-6, 1e-5, 1e-4, 1e-3, 1e-6]),
                    hp.choice('drop_rate', [0.3, 0.4, 0.5, 0.6]),
                    hp.choice('beta1',    [0.9, 0.95, 0.97, 0.99]),
                    hp.choice('beta2',    [ 0.9, 0.95, 0.97, 0.99]),
                    hp.choice('l1',    [0.0001, 0.0005, 0.001, 0.005, 0.01, 0]),
                    hp.choice('l2',    [0.0001, 0.0005, 0.001, 0.005, 0.01, 0]),
                    hp.choice('batchsize',    [ 128, 256, 64]),
                    hp.choice('activation',   ['relu', 'tanh', 'sigmoid']),
                    hp.choice('layer1_nodes',  [1024, 512, 256]),
                    hp.choice('layer2_nodes',  [512, 256, 128, 64, 0]),
                  ]

def f(hyperparams):
    """
    Function the TPE algorithm aims to minimize

    Runs the cross-validation for one set of hyperparameters, tracks the best
    accuracy seen so far in the module-level `best_acc` / `best_hyperparams`,
    and saves the result with `save_json_result`.

    Input:
        - hyperparams:  set of hyperparameters to be used
    Output:
        - result:       dictionary of results; 'loss' is the negated mean
                        validation accuracy, the value hyperopt minimizes
    """

    global best_acc
    global best_hyperparams

    # run cross-validation
    acc, epochs = hyperopt_cross_val(hyperparams)
    print('FINISHED THE HYPERPARAMTER TUNING')
    print(acc)

    # save globally optimal accuracy
    if best_acc is None or acc > best_acc:
        best_acc = acc
        best_hyperparams = hyperparams

    # save results
    # hyperopt requires a 'loss' entry whenever the status is STATUS_OK; without
    # it fmin raises InvalidLoss and the trial is never recorded.
    # 'acc' keeps its previous (negated) value so existing saved results stay comparable.
    result = {'loss': -acc, 'acc': -acc, 'epochs': str(epochs),  'status': STATUS_OK, 'space': hyperparams}
    print("THE RESULTS ARE:")
    print(result)
    save_json_result(str(acc), result)
    return result

def run_a_trial():
    """
    Function that runs one iteration of the TPE algorithm

    Loads the trial history from 'results.pkl' when it exists (otherwise starts
    a new history), runs one more evaluation with fmin and writes the updated
    history back to 'results.pkl'. Errors raised during the evaluation
    propagate to the caller and leave the saved history untouched.
    """

    nb_evals = 1

    print("Attempt to resume a past training if it exists:")
    try:
        # https://github.com/hyperopt/hyperopt/issues/267
        # NOTE: pickle.load can execute arbitrary code; only resume from a
        # results.pkl written by this notebook.
        with open('results.pkl', 'rb') as in_file:
            trials = pickle.load(in_file)
        print("Found saved Trials! Loading...")
        print("Rerunning from {} trials to add another one.".format(len(trials.trials)))
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # Only a missing or unreadable history restarts the search. Training
        # errors are deliberately not caught here, so they can no longer wipe
        # the saved trials.
        trials = Trials()
        print("Starting from scratch: nesw trials.")

    # Ask for exactly one evaluation on top of what the history already holds
    max_evals = len(trials.trials) + nb_evals
    best = fmin(f, parameter_space, algo=tpe.suggest, max_evals=max_evals, trials=trials)

    print("FINISHED FMIN")
    # Write to a temporary file first and swap it in, so an interruption during
    # the dump cannot leave a truncated results.pkl behind.
    with open('results.pkl.tmp', 'wb') as out_file:
        pickle.dump(trials, out_file)
    os.replace('results.pkl.tmp', 'results.pkl')
    print('best:')
    print(best)
      
# Keep adding one trial at a time until the kernel is interrupted.
# A failing trial is reported and the loop carries on with the next one.
while True:
    print("Optimizing New Model")
    try:
        run_a_trial()
    except Exception as error:
        print(str(error))


Optimizing New Model
Attempt to resume a past training if it exists:
Found saved Trials! Loading...
Rerunning from 0 trials to add another one.
  0%|          | 0/1 [00:00<?, ?it/s, best loss: ?]

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(0.0001, 0.4, 0.9, 0.9, 0.01, 0.0005, 128, 'sigmoid', 512, 128)
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model (TFRobertaMod  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 el)                            thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                               

  super(Adam, self).__init__(name, **kwargs)



 bidirectional (Bidirectional)  (None, 53, 1024)     14688256    ['dropout_37[0][0]']             
 dropout_38 (Dropout)           (None, 53, 1024)     0           ['bidirectional[0][0]']          
 tf.__operators__.getitem (Slic  (None, 768)         0           ['tf_roberta_model[0][0]']       
 ingOpLambda)                                                                                     
 bidirectional_1 (Bidirectional  (None, 1024)        16785408    ['dropout_38[0][0]']             
 )                                                                                                
 tf.concat (TFOpLambda)         (None, 1792)         0           ['tf.__operators__.getitem[0][0]'
                                                                 , 'bidirectional_1[0][0]']       
 dropout_39 (Dropout)           (None, 1792)         0           ['tf.concat[0][0]']              
 dense (Dense)                  (None, 512)          918016      ['dropout_39[0][0]']             
 dropout_4

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]






Instructions for updating:
use `experimental_local_results` instead.


Instructions for updating:
use `experimental_local_results` instead.
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 359s - loss: 368.1930 - accuracy: 0.5783 - val_loss: 3.5019 - val_accuracy: 0.5007 - 359s/epoch - 380ms/step

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model (TFRobertaMod  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 el)                            thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                    

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(0.0001, 0.4, 0.9, 0.9, 0.01, 0.0005, 128, 'sigmoid', 512, 128)
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_1 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                             

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 364s - loss: 368.1852 - accuracy: 0.5735 - val_loss: 3.5405 - val_accuracy: 0.5038 - 364s/epoch - 386ms/step

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_1 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                  

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(0.0001, 0.4, 0.9, 0.9, 0.01, 0.0005, 128, 'sigmoid', 512, 128)
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_2 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                             

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 373s - loss: 368.4350 - accuracy: 0.5865 - val_loss: 3.5080 - val_accuracy: 0.4974 - 373s/epoch - 395ms/step

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_2 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                  

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(0.0001, 0.3, 0.95, 0.97, 0.01, 0.0005, 128, 'tanh', 512, 256)
Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_3 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                              

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 368s - loss: 377.3647 - accuracy: 0.8171 - val_loss: 2.7533 - val_accuracy: 0.8073 - 368s/epoch - 390ms/step

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_3 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                  

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(0.0001, 0.3, 0.95, 0.97, 0.01, 0.0005, 128, 'tanh', 512, 256)
Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_4 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                              

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 370s - loss: 376.8894 - accuracy: 0.8241 - val_loss: 2.7083 - val_accuracy: 0.8348 - 370s/epoch - 392ms/step

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_4 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                  

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(0.0001, 0.3, 0.95, 0.97, 0.01, 0.0005, 128, 'tanh', 512, 256)
Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_5 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                              

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 369s - loss: 377.2551 - accuracy: 0.8061 - val_loss: 2.7619 - val_accuracy: 0.7980 - 369s/epoch - 391ms/step

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_5 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                  

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(5e-06, 0.6, 0.99, 0.97, 0.0001, 0.0005, 128, 'relu', 512, 256)
Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_6 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                             

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 367s - loss: 27.4303 - accuracy: 0.6710 - val_loss: 23.8451 - val_accuracy: 0.8491 - 367s/epoch - 388ms/step

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_6 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                  

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(5e-06, 0.6, 0.99, 0.97, 0.0001, 0.0005, 128, 'relu', 512, 256)
Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_7 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                             

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 368s - loss: 27.4778 - accuracy: 0.6697 - val_loss: 23.9110 - val_accuracy: 0.8463 - 368s/epoch - 390ms/step

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_7 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                  

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(5e-06, 0.6, 0.99, 0.97, 0.0001, 0.0005, 128, 'relu', 512, 256)
Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_8 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                             

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 370s - loss: 27.4326 - accuracy: 0.6871 - val_loss: 23.8720 - val_accuracy: 0.8489 - 370s/epoch - 392ms/step

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_8 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                  

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(1e-05, 0.5, 0.97, 0.9, 0.001, 0.001, 256, 'tanh', 1024, 256)
Model: "model_9"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_9 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                               

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


472/472 - 282s - loss: 259.7688 - accuracy: 0.7803 - val_loss: 223.5496 - val_accuracy: 0.8616 - 282s/epoch - 598ms/step

Model: "model_9"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_9 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(1e-05, 0.5, 0.97, 0.9, 0.001, 0.001, 256, 'tanh', 1024, 256)
Model: "model_10"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_10 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                              

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


472/472 - 282s - loss: 259.6718 - accuracy: 0.7860 - val_loss: 223.4798 - val_accuracy: 0.8587 - 282s/epoch - 598ms/step

Model: "model_10"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_10 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                               

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(1e-05, 0.5, 0.97, 0.9, 0.001, 0.001, 256, 'tanh', 1024, 256)
Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_11 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                              

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


472/472 - 281s - loss: 259.5987 - accuracy: 0.7861 - val_loss: 223.4059 - val_accuracy: 0.8486 - 281s/epoch - 596ms/step

Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_11 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                               

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(5e-05, 0.4, 0.95, 0.9, 0.005, 0.0001, 256, 'sigmoid', 1024, 512)
Model: "model_12"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_12 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                          

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


472/472 - 281s - loss: 748.4526 - accuracy: 0.7594 - val_loss: 248.3278 - val_accuracy: 0.8363 - 281s/epoch - 596ms/step

Model: "model_12"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_12 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                               

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(5e-05, 0.4, 0.95, 0.9, 0.005, 0.0001, 256, 'sigmoid', 1024, 512)
Model: "model_13"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_13 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                          

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


472/472 - 285s - loss: 747.7946 - accuracy: 0.7535 - val_loss: 247.8346 - val_accuracy: 0.8394 - 285s/epoch - 604ms/step

Model: "model_13"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_13 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                               

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(5e-05, 0.4, 0.95, 0.9, 0.005, 0.0001, 256, 'sigmoid', 1024, 512)
Model: "model_14"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_14 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                          

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


472/472 - 280s - loss: 748.1220 - accuracy: 0.7721 - val_loss: 248.8858 - val_accuracy: 0.8349 - 280s/epoch - 593ms/step

Model: "model_14"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_14 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                               

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(1e-06, 0.6, 0.97, 0.99, 0.0005, 0, 128, 'sigmoid', 1024, 512)
Model: "model_15"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_15 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                             

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 372s - loss: 145.2019 - accuracy: 0.5402 - val_loss: 141.1254 - val_accuracy: 0.8131 - 372s/epoch - 394ms/step

Model: "model_15"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_15 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                               

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(1e-06, 0.6, 0.97, 0.99, 0.0005, 0, 128, 'sigmoid', 1024, 512)
Model: "model_16"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_16 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                             

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 367s - loss: 145.2378 - accuracy: 0.5375 - val_loss: 141.1095 - val_accuracy: 0.8107 - 367s/epoch - 389ms/step

Model: "model_16"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_16 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                               

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(1e-06, 0.6, 0.97, 0.99, 0.0005, 0, 128, 'sigmoid', 1024, 512)
Model: "model_17"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_17 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                             

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 369s - loss: 145.2586 - accuracy: 0.5274 - val_loss: 141.1787 - val_accuracy: 0.8125 - 369s/epoch - 391ms/step

Model: "model_17"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_17 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                               

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(1e-05, 0.5, 0.99, 0.9, 0.01, 0.005, 128, 'sigmoid', 512, 512)
Model: "model_18"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_18 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                             

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 371s - loss: 2091.5554 - accuracy: 0.6921 - val_loss: 1499.4747 - val_accuracy: 0.8057 - 371s/epoch - 393ms/step

Model: "model_18"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_18 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                             

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(1e-05, 0.5, 0.99, 0.9, 0.01, 0.005, 128, 'sigmoid', 512, 512)
Model: "model_19"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_19 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                             

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 372s - loss: 2091.1287 - accuracy: 0.7276 - val_loss: 1499.1377 - val_accuracy: 0.8459 - 372s/epoch - 394ms/step

Model: "model_19"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_19 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                             

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(1e-05, 0.5, 0.99, 0.9, 0.01, 0.005, 128, 'sigmoid', 512, 512)
Model: "model_20"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_20 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                             

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 370s - loss: 2091.5513 - accuracy: 0.7239 - val_loss: 1499.3756 - val_accuracy: 0.8556 - 370s/epoch - 391ms/step

Model: "model_20"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_20 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                             

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(0.0005, 0.6, 0.97, 0.95, 0.0001, 0.001, 256, 'tanh', 1024, 128)
Model: "model_21"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_21 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                           

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


472/472 - 287s - loss: 4.7777 - accuracy: 0.5012 - val_loss: 0.9863 - val_accuracy: 0.5007 - 287s/epoch - 608ms/step

Model: "model_21"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_21 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                   

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(0.0005, 0.6, 0.97, 0.95, 0.0001, 0.001, 256, 'tanh', 1024, 128)
Model: "model_22"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_22 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                           

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


472/472 - 284s - loss: 4.7761 - accuracy: 0.5007 - val_loss: 0.9822 - val_accuracy: 0.5038 - 284s/epoch - 603ms/step

Model: "model_22"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_22 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                   

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(0.0005, 0.6, 0.97, 0.95, 0.0001, 0.001, 256, 'tanh', 1024, 128)
Model: "model_23"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_23 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                           

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


472/472 - 285s - loss: 4.7419 - accuracy: 0.4994 - val_loss: 0.9499 - val_accuracy: 0.4974 - 285s/epoch - 604ms/step

Model: "model_23"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_23 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                   

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(5e-05, 0.5, 0.95, 0.9, 0.001, 0, 256, 'tanh', 256, 128)
Model: "model_24"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_24 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                                   

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


472/472 - 282s - loss: 126.0422 - accuracy: 0.8264 - val_loss: 39.5688 - val_accuracy: 0.8688 - 282s/epoch - 596ms/step

Model: "model_24"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_24 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(5e-05, 0.5, 0.95, 0.9, 0.001, 0, 256, 'tanh', 256, 128)
Model: "model_25"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_25 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                                   

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


472/472 - 282s - loss: 125.7987 - accuracy: 0.8297 - val_loss: 39.3083 - val_accuracy: 0.8680 - 282s/epoch - 598ms/step

Model: "model_25"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_25 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(5e-05, 0.5, 0.95, 0.9, 0.001, 0, 256, 'tanh', 256, 128)
Model: "model_26"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_26 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                                   

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


472/472 - 284s - loss: 125.8828 - accuracy: 0.8252 - val_loss: 39.4103 - val_accuracy: 0.8650 - 284s/epoch - 601ms/step

Model: "model_26"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_26 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(5e-05, 0.3, 0.97, 0.99, 0.001, 0.0001, 64, 'tanh', 1024, 64)
Model: "model_27"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_27 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                              

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


1888/1888 - 553s - loss: 40.7712 - accuracy: 0.8358 - val_loss: 0.7281 - val_accuracy: 0.8546 - 553s/epoch - 293ms/step

Model: "model_27"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_27 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(5e-05, 0.3, 0.97, 0.99, 0.001, 0.0001, 64, 'tanh', 1024, 64)
Model: "model_28"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_28 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                              

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


1888/1888 - 555s - loss: 40.7250 - accuracy: 0.8396 - val_loss: 0.6513 - val_accuracy: 0.8630 - 555s/epoch - 294ms/step

Model: "model_28"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_28 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(5e-05, 0.3, 0.97, 0.99, 0.001, 0.0001, 64, 'tanh', 1024, 64)
Model: "model_29"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_29 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                              

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


1888/1888 - 555s - loss: 41.0602 - accuracy: 0.8361 - val_loss: 0.7625 - val_accuracy: 0.8584 - 555s/epoch - 294ms/step

Model: "model_29"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_29 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(1e-05, 0.3, 0.95, 0.9, 0.005, 0.001, 128, 'sigmoid', 512, 512)
Model: "model_30"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_30 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                            

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 369s - loss: 1037.5884 - accuracy: 0.7947 - val_loss: 744.0745 - val_accuracy: 0.8562 - 369s/epoch - 391ms/step

Model: "model_30"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_30 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                              

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(1e-05, 0.3, 0.95, 0.9, 0.005, 0.001, 128, 'sigmoid', 512, 512)
Model: "model_31"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_31 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                            

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 376s - loss: 1037.4694 - accuracy: 0.7927 - val_loss: 743.9675 - val_accuracy: 0.8574 - 376s/epoch - 399ms/step

Model: "model_31"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_31 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                              

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(1e-05, 0.3, 0.95, 0.9, 0.005, 0.001, 128, 'sigmoid', 512, 512)
Model: "model_32"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_32 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                            

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 373s - loss: 1037.4562 - accuracy: 0.7957 - val_loss: 743.9309 - val_accuracy: 0.8468 - 373s/epoch - 395ms/step

Model: "model_32"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_32 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                              

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(0.0001, 0.6, 0.99, 0.95, 0, 0, 64, 'sigmoid', 256, 64)
Model: "model_33"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_33 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                                    

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


1888/1888 - 536s - loss: 0.6532 - accuracy: 0.5752 - val_loss: 0.6932 - val_accuracy: 0.5007 - 536s/epoch - 284ms/step

Model: "model_33"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_33 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                 

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(0.0001, 0.6, 0.99, 0.95, 0, 0, 64, 'sigmoid', 256, 64)
Model: "model_34"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_34 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                                    

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


1888/1888 - 542s - loss: 0.7249 - accuracy: 0.5085 - val_loss: 0.6931 - val_accuracy: 0.5038 - 542s/epoch - 287ms/step

Model: "model_34"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_34 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                 

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(0.0001, 0.6, 0.99, 0.95, 0, 0, 64, 'sigmoid', 256, 64)
Model: "model_35"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_35 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                                    

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


1888/1888 - 549s - loss: 0.6980 - accuracy: 0.5147 - val_loss: 0.6933 - val_accuracy: 0.5026 - 549s/epoch - 291ms/step

Model: "model_35"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_35 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                 

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(0.0001, 0.6, 0.9, 0.95, 0.01, 0.0005, 256, 'tanh', 1024, 512)
Model: "model_36"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_36 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                             

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


472/472 - 279s - loss: 835.1861 - accuracy: 0.8140 - val_loss: 27.0056 - val_accuracy: 0.8506 - 279s/epoch - 591ms/step

Model: "model_36"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_36 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(0.0001, 0.6, 0.9, 0.95, 0.01, 0.0005, 256, 'tanh', 1024, 512)
Model: "model_37"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_37 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                             

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


472/472 - 281s - loss: 835.1381 - accuracy: 0.8236 - val_loss: 27.0346 - val_accuracy: 0.8315 - 281s/epoch - 596ms/step

Model: "model_37"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_37 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(0.0001, 0.6, 0.9, 0.95, 0.01, 0.0005, 256, 'tanh', 1024, 512)
Model: "model_38"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_38 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                             

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


472/472 - 281s - loss: 835.3997 - accuracy: 0.8204 - val_loss: 28.2806 - val_accuracy: 0.8253 - 281s/epoch - 595ms/step

Model: "model_38"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_38 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(1e-06, 0.5, 0.97, 0.97, 0.0001, 0.0005, 128, 'tanh', 512, 0)
Model: "model_39"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_39 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                              

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 366s - loss: 29.4535 - accuracy: 0.6932 - val_loss: 28.5152 - val_accuracy: 0.8402 - 366s/epoch - 388ms/step

Model: "model_39"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_39 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                 

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(1e-06, 0.5, 0.97, 0.97, 0.0001, 0.0005, 128, 'tanh', 512, 0)
Model: "model_40"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_40 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                              

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 371s - loss: 29.4458 - accuracy: 0.7024 - val_loss: 28.5331 - val_accuracy: 0.8362 - 371s/epoch - 393ms/step

Model: "model_40"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_40 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                 

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(1e-06, 0.5, 0.97, 0.97, 0.0001, 0.0005, 128, 'tanh', 512, 0)
Model: "model_41"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_41 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                              

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


944/944 - 373s - loss: 29.4829 - accuracy: 0.6818 - val_loss: 28.5353 - val_accuracy: 0.8362 - 373s/epoch - 395ms/step

Model: "model_41"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_41 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                 

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(1e-05, 0.6, 0.9, 0.9, 0, 0.005, 64, 'tanh', 512, 256)
Model: "model_42"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_42 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                                     

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


1888/1888 - 556s - loss: 19.9864 - accuracy: 0.8164 - val_loss: 9.1086 - val_accuracy: 0.8640 - 556s/epoch - 294ms/step

Model: "model_42"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_42 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Training with the following hyperparameters:
(1e-05, 0.6, 0.9, 0.9, 0, 0.005, 64, 'tanh', 512, 256)
Model: "model_43"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_43 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                                     

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 55) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 1) dtype=int64>]


1888/1888 - 559s - loss: 20.2771 - accuracy: 0.8145 - val_loss: 9.5727 - val_accuracy: 0.8611 - 559s/epoch - 296ms/step

Model: "model_43"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 55)]         0           []                               
 input_attention (InputLayer)   [(None, 55)]         0           []                               
 tf_roberta_model_43 (TFRoberta  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 Model)                         thPoolingAndCrossAt               'input_attention[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 55,                                                
                                768),                                