# Build model architecture

In [1]:
# Make sure you have tensorflow-macos==2.9 and tensorflow-metal==0.5.0 installed.

In [2]:
from transformers import TFAutoModel
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Name of the pretrained checkpoint to load.
PRETRAINED_MODEL_NAME = 'bert-base-cased'

# Load the pretrained BERT model (TensorFlow weights) for that checkpoint.
bert = TFAutoModel.from_pretrained(PRETRAINED_MODEL_NAME)

Metal device set to: Apple M1


2023-01-03 15:40:09.248729: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-01-03 15:40:09.248809: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identica

In [4]:
# Print the layer/parameter overview of the loaded BERT model.
bert.summary()

Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


Now we need to define the frame around Bert. We need:

- Two input layers (one for input IDs and one for attention mask).

- A post-bert dropout layer to reduce the likelihood of overfitting and improve generalization (optional; the code below does not add one).

- A pooling step to convert the 3D tensors output by Bert to 2D (the code below uses Bert's built-in pooled output instead of a separate max pooling layer).

- Final output activations using softmax for outputting categorical probabilities.

In [5]:
# Two input layers: one for the token IDs and one for the attention mask.
# Each layer's `name` must match the corresponding dictionary key in the TF dataset.
SEQ_LEN = 512  # number of token positions per input sequence

input_ids = tf.keras.layers.Input(
    shape=(SEQ_LEN,),
    name='input_ids',
    dtype='int32',
)
mask = tf.keras.layers.Input(
    shape=(SEQ_LEN,),
    name='attention_mask',
    dtype='int32',
)

In [6]:

# Transformer body
#
# `bert` is the full model wrapper; its `.bert` attribute is the underlying main
# layer, which we call directly on the two Keras inputs.
#
# `pooler_output` is the field the positional index `[1]` used to select. It is
# already pooled to one vector per sequence, so no extra pooling layer is needed.
# Per the Hugging Face docs it is derived from the first ([CLS]) token, not from
# max pooling over the sequence.
embeddings = bert.bert(input_ids, attention_mask=mask).pooler_output

In [7]:
# Classifier head: maps the BERT embeddings to 5 output classes.

# Intermediate fully-connected layer with ReLU activation.
hidden_dense = tf.keras.layers.Dense(1024, activation='relu')
# Output layer: softmax over the 5 classes.
output_dense = tf.keras.layers.Dense(5, activation='softmax', name='outputs')

x = hidden_dense(embeddings)
y = output_dense(x)

In [8]:
# Assemble the end-to-end Keras model: two inputs (token IDs and attention mask)
# wired through BERT and the classifier head to the softmax output `y`.
model = tf.keras.Model(
    inputs=[input_ids, mask],
    outputs=y,
)

In [9]:
# (optional) freeze the pretrained BERT layer so its weights are not updated during training.
# The layer is looked up by name ("bert", as listed by model.summary()) rather than by
# position, so this keeps working if layers are added to or reordered in the model.
model.get_layer('bert').trainable = False



In [10]:
# Print the layer-by-layer overview of the assembled model (inputs, BERT, classifier head).
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                           

Our model architecture is now set up, and we can initialize our training parameters like so:

In [11]:
# Training configuration.
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
# The values themselves are unchanged (recommended values when training a bert model).
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)
loss = tf.keras.losses.CategoricalCrossentropy()  # 5 labels/categories
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

# Attach optimizer, loss and metric to the model before training.
model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

Now all we need to do is train our model. For this, we need to load in our training and validation datasets - which also requires our dataset element specs to be defined.

In [14]:
# Element spec describing one stored batch: a dict of model inputs plus the labels.
# Both model inputs share the same spec, so it is defined once and reused.
token_spec = tf.TensorSpec(shape=(16, 512), dtype=tf.int64)
label_spec = tf.TensorSpec(shape=(16, 5), dtype=tf.float64)

element_spec = (
    {
        'input_ids': token_spec,
        'attention_mask': token_spec,
    },
    label_spec,
)

# Load the training and validation sets saved under 'train' and 'val'.
train_ds = tf.data.experimental.load('train', element_spec=element_spec)
val_ds = tf.data.experimental.load('val', element_spec=element_spec)

# Display the input format of the training dataset.
train_ds.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 512), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(16, 512), dtype=tf.int64, name=None)}, TensorSpec(shape=(16, 5), dtype=tf.float64, name=None))>

And now we train our model as usual.

In [15]:
# Train on the training set, evaluating on the validation set after every epoch.
NUM_EPOCHS = 3

model.fit(x=train_ds, validation_data=val_ds, epochs=NUM_EPOCHS)

Epoch 1/3


2023-01-03 15:44:56.130617: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


1574/8778 [====>.........................] - ETA: 5:21:32 - loss: 1.1775 - accuracy: 0.5491

KeyboardInterrupt: 

In [None]:
# Save the model

In [None]:
# Persist the trained model under the relative path 'sentiment_model'.
model.save('sentiment_model')