In [34]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from datasets import load_dataset

In [35]:
model = TFAutoModel.from_pretrained("bert-base-uncased")

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [36]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [37]:
inputs = tokenizer(['Hello world', 'Hi how are you'], padding=True, truncation=True,
                  return_tensors='tf')
inputs

{'input_ids': <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[ 101, 7592, 2088,  102,    0,    0],
       [ 101, 7632, 2129, 2024, 2017,  102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [38]:
output = model(inputs)
output

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(2, 6, 768), dtype=float32, numpy=
array([[[-0.16888332,  0.13606355, -0.13940018, ..., -0.6251125 ,
          0.05217262,  0.36714536],
        [-0.3632745 ,  0.14121903,  0.8799885 , ...,  0.10433032,
          0.2887578 ,  0.37267894],
        [-0.69859415, -0.69879794,  0.06450251, ..., -0.22103661,
          0.00986893, -0.5939796 ],
        [ 0.83098257,  0.12366717, -0.15119013, ...,  0.10309545,
         -0.67792666, -0.26285172],
        [-0.40266633, -0.01928236,  0.5732502 , ..., -0.20656842,
          0.02338582,  0.20126349],
        [-0.6228408 , -0.27453488,  0.1811763 , ..., -0.12944865,
         -0.03839079, -0.05733156]],

       [[ 0.09286558, -0.02636361, -0.12239343, ..., -0.21063566,
          0.17386371,  0.17250973],
        [ 0.40742022, -0.05930945,  0.55234593, ..., -0.6790563 ,
          0.6555748 , -0.2945646 ],
        [-0.21155298, -0.6858643 , -0.46280792, ...,  0.15278494

In [39]:
emotions = load_dataset('SetFit/emotion')

  0%|          | 0/3 [00:00<?, ?it/s]

In [40]:
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2000
    })
})

In [41]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [42]:
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [43]:
emotions_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [44]:
# setting 'input_ids', 'attention_mask', 'token_type_ids', and 'label'
# to the tensorflow format. Now if you access this dataset you will get these
# columns in `tf.Tensor` format

emotions_encoded.set_format('tf', 
                            columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])

# setting BATCH_SIZE to 64.
BATCH_SIZE = 64

def order(inp):
    '''
    This function will group all the inputs of BERT
    into a single dictionary and then output it with
    labels.
    '''
    data = list(inp.values())
    return {
        'input_ids': data[1],
        'attention_mask': data[2],
        'token_type_ids': data[3]
    }, data[0]

# converting train split of `emotions_encoded` to tensorflow format
train_dataset = tf.data.Dataset.from_tensor_slices(emotions_encoded['train'][:])
# set batch_size and shuffle
train_dataset = train_dataset.batch(BATCH_SIZE).shuffle(1000)
# map the `order` function
train_dataset = train_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

# ... doing the same for test set ...
test_dataset = tf.data.Dataset.from_tensor_slices(emotions_encoded['test'][:])
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

In [45]:
inp, out = next(iter(train_dataset)) # a batch from train_dataset
print(inp, '\n\n', out)

{'input_ids': <tf.Tensor: shape=(64, 87), dtype=int64, numpy=
array([[ 101, 1045, 2066, ...,    0,    0,    0],
       [ 101, 1045, 2514, ...,    0,    0,    0],
       [ 101, 1045, 2123, ...,    0,    0,    0],
       ...,
       [ 101, 1045, 2064, ...,    0,    0,    0],
       [ 101, 1045, 2123, ...,    0,    0,    0],
       [ 101, 1045, 2514, ...,    0,    0,    0]])>, 'attention_mask': <tf.Tensor: shape=(64, 87), dtype=int64, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>, 'token_type_ids': <tf.Tensor: shape=(64, 87), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>} 

 tf.Tensor(
[3 2 2 1 0 0 3 1 1 4 1 1 0 0 2 1 1 1 1 1 0 1 2 1 4 1 0 1 1 4 1 

In [46]:
class BERTForClassification(tf.keras.Model):
    
    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(num_classes, activation='softmax')
        
    def call(self, inputs):
        x = self.bert(inputs)[1]
        return self.fc(x)

In [47]:
classifier = BERTForClassification(model, num_classes=6)

classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

In [48]:
history = classifier.fit(
    train_dataset,
    epochs=3
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [49]:
classifier.evaluate(test_dataset)



[0.18074743449687958, 0.921500027179718]

In [65]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from datasets import load_dataset

In [66]:
model = TFAutoModel.from_pretrained("roberta-base")

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [67]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [68]:
inputs = tokenizer(['Hello world', 'Hi how are you'], padding=True, truncation=True,
                  return_tensors='tf')
inputs

{'input_ids': <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[    0, 31414,   232,     2,     1,     1],
       [    0, 30086,   141,    32,    47,     2]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [69]:
output = model(inputs)
output


TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(2, 6, 768), dtype=float32, numpy=
array([[[-0.06132455,  0.08516992,  0.00078   , ..., -0.06125636,
         -0.05824584, -0.00582781],
        [-0.1364578 , -0.01367003,  0.06624176, ...,  0.03447215,
         -0.08437751,  0.14693369],
        [-0.14509705,  0.17469448,  0.1423138 , ..., -0.48851636,
         -0.19828328,  0.4867814 ],
        [-0.06250261,  0.08806741, -0.020095  , ..., -0.11117796,
         -0.06778044, -0.0407331 ],
        [-0.09004237,  0.01524699,  0.11763413, ..., -0.01570032,
         -0.08172794,  0.10898153],
        [-0.09004237,  0.01524699,  0.11763413, ..., -0.01570032,
         -0.08172794,  0.10898153]],

       [[-0.05453827,  0.10439846, -0.00956053, ..., -0.06943446,
         -0.04845492, -0.00215347],
        [-0.10498375,  0.1532619 ,  0.09994312, ..., -0.32217735,
          0.10816762, -0.06135899],
        [-0.19492397,  0.11645846,  0.09339952, ..., -0.29414424

In [70]:
emotions = load_dataset('SetFit/emotion')


  0%|          | 0/3 [00:00<?, ?it/s]

In [71]:
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2000
    })
})

In [72]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [73]:
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [74]:
emotions_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [75]:
# setting 'input_ids', 'attention_mask', 'token_type_ids', and 'label'
# to the tensorflow format. Now if you access this dataset you will get these
# columns in `tf.Tensor` format

emotions_encoded.set_format('tf', 
                            columns=['input_ids', 'attention_mask', 'label'])

# setting BATCH_SIZE to 64.
BATCH_SIZE = 64

def order(inp):
    '''
    This function will group all the inputs of BERT
    into a single dictionary and then output it with
    labels.
    '''
    data = list(inp.values())
    return {
        'input_ids': data[1],
        'attention_mask': data[2],
#         'token_type_ids': data[3]
    }, data[0]

# converting train split of `emotions_encoded` to tensorflow format
train_dataset = tf.data.Dataset.from_tensor_slices(emotions_encoded['train'][:])
# set batch_size and shuffle
train_dataset = train_dataset.batch(BATCH_SIZE).shuffle(1000)
# map the `order` function
train_dataset = train_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

# ... doing the same for test set ...
test_dataset = tf.data.Dataset.from_tensor_slices(emotions_encoded['test'][:])
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

In [76]:
inp, out = next(iter(train_dataset)) # a batch from train_dataset
print(inp, '\n\n', out)

{'input_ids': <tf.Tensor: shape=(64, 88), dtype=int64, numpy=
array([[  0, 118, 619, ...,   1,   1,   1],
       [  0, 118, 619, ...,   1,   1,   1],
       [  0, 118, 619, ...,   1,   1,   1],
       ...,
       [  0, 118, 619, ...,   1,   1,   1],
       [  0, 118,  33, ...,   1,   1,   1],
       [  0, 757,  45, ...,   1,   1,   1]])>, 'attention_mask': <tf.Tensor: shape=(64, 88), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>} 

 tf.Tensor(
[1 1 0 1 1 1 0 0 1 2 1 4 0 1 1 0 3 1 1 1 0 0 1 0 1 1 0 0 0 1 3 5 4 1 4 0 1
 0 0 0 1 4 1 0 1 1 1 0 0 1 1 4 1 3 3 1 0 1 0 0 0 0 1 4], shape=(64,), dtype=int64)


In [77]:
class ROBERTAForClassification(tf.keras.Model):
    
    def __init__(self, roberta_model, num_classes):
        super().__init__()
        self.bert = roberta_model
        self.fc = tf.keras.layers.Dense(num_classes, activation='softmax')
        
    def call(self, inputs):
        x = self.bert(inputs)[1]
        return self.fc(x)

In [78]:
classifier = ROBERTAForClassification(model, num_classes=6)

classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

In [79]:
history = classifier.fit(
    train_dataset,
    epochs=3
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [80]:
classifier.evaluate(test_dataset)



[0.16786357760429382, 0.9309999942779541]