## BERT Transfer-learning

On this version, I'm starting from the TFDistilBertModel pre-trained model. Here, I'm freezing the weights from the original model, then averaging the embeddings from the last hidden state and use them as input for a shallow classifier. This appraoch didn't perform as well as fine-tuning.

In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, TFAutoModel

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m101.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m84.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
#Load datasets and pre-trained models.
train_dataset = pd.read_csv('/dataset/train_clean.csv',index_col=False,encoding='utf-8')
test_dataset = pd.read_csv('/dataset/test_clean.csv',index_col=False,encoding='utf-8')
val_dataset = pd.read_csv('/dataset/val_clean.csv',index_col=False,encoding='utf-8')

# train_dataset = pd.read_csv('/dataset/train_balanced_clean.csv',index_col=False,encoding='utf-8')
# test_dataset = pd.read_csv('/dataset/test_balanced_clean.csv',index_col=False,encoding='utf-8')
# val_dataset = pd.read_csv('/dataset/val_balanced_clean.csv',index_col=False,encoding='utf-8')

bert_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
bert_base_model = TFAutoModel.from_pretrained('distilbert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [3]:
#Extract token_ids and attention_masks. Then format into BERT input.
emoji_label = {'sadness': 0,
               'anger': 1,
               'joy': 2,
               'love': 3,
               'surprise': 4,
               'fear': 5}

def process_datasets(df_dataset,batches = 32):
    tokens = bert_tokenizer(text=df_dataset['text'].tolist(),
                                  add_special_tokens=True,
                                  max_length=64,
                                  truncation=True,
                                  padding=True,
                                  return_tensors='tf',
                                  return_token_type_ids=False,
                                  return_attention_mask=True,
                                  verbose=True)
    labels = tf.keras.utils.to_categorical([emoji_label[e] for e in df_dataset.emoji.tolist()])
    tf_dataset = tf.data.Dataset.from_tensor_slices((tokens['input_ids'],tokens['attention_mask'],labels)).batch(batches)
    return tf_dataset.map(lambda id,mask,label: ({'input_ids':id, 'attention_mask': mask},label))

tf_trainset = process_datasets(train_dataset)
tf_testset = process_datasets(test_dataset,1)
tf_valset = process_datasets(val_dataset)

In [4]:
#Model instantiation
bert_base_model.trainable = False

input_ids = tf.keras.layers.Input(shape=(64,), name="input_ids", dtype='int32')
input_attention_mask = tf.keras.layers.Input(shape=(64), name="attention_mask", dtype='int32')
bert_emb = bert_base_model(input_ids, attention_mask = input_attention_mask)[0] #Get last hidden layer (embeddings for all tokens)
out = tf.keras.layers.GlobalAveragePooling1D()(bert_emb) #Reduce dimension by average poolin.
out = tf.keras.layers.Dense(32, activation='relu')(out)
out = tf.keras.layers.Dropout(0.2)(out)
y = tf.keras.layers.Dense(6,activation='softmax')(out)
model = tf.keras.Model(inputs=[input_ids,input_attention_mask], outputs=y)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 64)]                 0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 64)]                 0         []                            
 )                                                                                                
                                                                                                  
 tf_distil_bert_model (TFDi  TFBaseModelOutput(last_hid   6636288   ['input_ids[0][0]',           
 stilBertModel)              den_state=(None, 64, 768),   0          'attention_mask[0][0]']      
                              hidden_states=None, atten                                       

In [6]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss',patience=5)
model.compile(optimizer=tf.keras.optimizers.Adam(.001),
             loss=tf.keras.losses.CategoricalCrossentropy(),
             metrics=[tf.keras.metrics.CategoricalAccuracy('balanced_accuracy')])

In [7]:
history = model.fit(tf_trainset, validation_data=tf_valset, epochs=50, callbacks=[stop_early])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50


In [8]:
#Evaluate model with test set.
predicted = model.predict(tf_testset)
predicted = np.argmax(predicted, axis=1)



In [9]:
y_test = np.array([np.argmax(label[1]) for label in tf_testset.as_numpy_iterator()])
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.65      0.73      0.69       581
           1       0.62      0.45      0.53       275
           2       0.73      0.82      0.77       695
           3       0.47      0.30      0.37       159
           4       0.54      0.20      0.29        66
           5       0.57      0.59      0.58       224

    accuracy                           0.66      2000
   macro avg       0.60      0.52      0.54      2000
weighted avg       0.64      0.66      0.64      2000

