# Facebook Posts Sentiment Analysis Using Transformers (BERT)

Let's first install the `transformers` package, which provides the BERT model that I will use to compute word embeddings.

In [4]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.9 MB/s[0m eta [36m0:00:00[0m
[?25h

Import all required packages

In [5]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

from transformers import AutoTokenizer,TFBertModel

A function to format the result.

In [None]:
def format_result(y_pred_prob: np.ndarray, post_id: list) -> pd.DataFrame:
    """Transform predicted class probabilities into the required one-hot format.

    Parameters
    ----------
    y_pred_prob : array-like of shape (n_posts, 3)
        Predicted score per class for each post. Columns are labelled, in
        order, Appreciation, Complaint, Feedback.
    post_id : list-like of length n_posts
        Post identifiers, in the same row order as ``y_pred_prob``. A pandas
        Series is accepted; only its values are used, never its index.

    Returns
    -------
    pd.DataFrame
        One row per post with columns ``postId``, ``Appreciation_pred``,
        ``Complaint_pred`` and ``Feedback_pred``. Exactly one ``*_pred``
        column is 1 in every row.

    Raises
    ------
    ValueError
        If ``y_pred_prob`` is not 2-D with three columns, or if its row count
        differs from the number of post ids.
    """
    pred_columns = ['Appreciation_pred', 'Complaint_pred', 'Feedback_pred']
    y_pred_prob = np.asarray(y_pred_prob)
    # Materialise the ids as a plain list so that a Series with a non-default
    # index cannot misalign rows when the two frames are combined.
    post_ids = list(post_id)

    # Validate before building anything so the error is raised up front.
    if y_pred_prob.ndim != 2 or y_pred_prob.shape[0] != len(post_ids):
        raise ValueError('The shapes of y_pred_prob and post_id do not match')
    if y_pred_prob.shape[1] != len(pred_columns):
        raise ValueError(
            f'y_pred_prob must have {len(pred_columns)} columns, got {y_pred_prob.shape[1]}')

    # argmax selects a single winner per row (the first one on ties), whereas
    # comparing against the row maximum would flag every tied class.
    y_pred_matrix = np.zeros(y_pred_prob.shape, dtype=int)
    y_pred_matrix[np.arange(len(post_ids)), y_pred_prob.argmax(axis=1)] = 1

    df_post_id = pd.DataFrame({'postId': post_ids})
    df_pred = pd.DataFrame(y_pred_matrix, columns=pred_columns)

    return df_post_id.join(df_pred)

In [3]:
# Import training data (tab-separated, first row is the header)
df = pd.read_csv('FB_posts_labeled.txt', sep='\t', header=0)
# Shuffle the rows. The fixed seed keeps the row order, and therefore the
# train/validation split and the reported scores, reproducible across runs.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Collapse the one-hot columns into a single integer class id
# (0 = Appreciation, 1 = Complaint, 2 = Feedback, following the column order).
label = np.argmax(df[['Appreciation', 'Complaint', 'Feedback']].values, axis=1)

df['label'] = label

df.head()

Unnamed: 0,postId,message,Appreciation,Complaint,Feedback,label
0,179590995428478_435543686499873,I've just lost a contract on a real estate pro...,0,1,0,1
1,152789358067261_350813731598155,The rewards program for debit cards is ending ...,0,1,0,1
2,8103318119_10150996333493120,joy n happiness,1,0,0,0
3,17648521247_10151266432531248,One my favorate stores!,1,0,0,0
4,6806028948_10151100419798949,You set an example to the other airlines - You...,1,0,0,0


Get the tokenizer for the BERT model, and the model itself, to transform the comments into vector representations.

In [15]:
# Load the tokenizer and the TensorFlow BERT encoder from the same pretrained
# checkpoint so that the vocabulary and the model weights match.
bert_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
bert = TFBertModel.from_pretrained(bert_checkpoint)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Split the data into train and validation set

In [7]:
# Hold out a validation split, stratified on the integer class label so both
# parts keep the same class proportions. The fixed random_state makes the
# split (and therefore the validation scores) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df[['Appreciation', 'Complaint', 'Feedback']].values,
    stratify=df['label'],
    random_state=42,
)

Tokenize the train and validation datasets

In [12]:
def encode_messages(texts, max_length=85):
    """Tokenize raw post texts into fixed-length BERT inputs.

    Parameters
    ----------
    texts : list of str
        Messages to encode.
    max_length : int, default 85
        Sequence length every message is truncated or padded to. It must equal
        the sequence length declared on the model's ``Input`` layers.

    Returns
    -------
    The tokenizer output holding ``input_ids`` and ``attention_mask`` as
    TensorFlow tensors (token type ids are not returned).
    """
    return tokenizer(
        text=list(texts),
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        # Pad to max_length rather than to the longest message in this batch:
        # the model input has a fixed width, so batch-dependent padding breaks
        # whenever a dataset contains no message that reaches max_length.
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True)


x_train = encode_messages(X_train.tolist())
x_test = encode_messages(X_test.tolist())

Define the neural network layers for the sentiment classification task. Here BERT is used as an embedding layer.

In [16]:
# Sequence length of the model inputs; must equal the tokenizer's max_length.
max_len = 85
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
# Take the first BERT output (the per-token hidden states) as embeddings.
embeddings = bert(input_ids, attention_mask=input_mask)[0]
# Collapse the token axis so each post becomes one fixed-size vector.
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)
y = Dense(3, activation='softmax')(out)  # Here we have 3 classes
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
# Fine-tune BERT together with the classification head. Set the flag on the
# layer object itself rather than on model.layers[2], whose position depends
# on how Keras happens to order the layers.
bert.trainable = True

Define the optimizer for training model

In [18]:
optimizer = Adam(
    learning_rate=5e-05,  # this learning rate is for bert model, taken from huggingface website
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)
# Set loss and metrics
loss = CategoricalCrossentropy()
# CategoricalAccuracy is plain (unweighted) accuracy, so it is labelled as
# such; the previous 'balanced_accuracy' name was misleading in the logs.
# An explicit list also replaces the tuple that a stray trailing comma created.
metric = [CategoricalAccuracy('accuracy')]
# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metric)

Train the model

In [19]:
# Feed the model by input-layer name: each dict key matches an Input's name.
input_names = ('input_ids', 'attention_mask')
train_inputs = {name: x_train[name] for name in input_names}
val_inputs = {name: x_test[name] for name in input_names}

# Short fine-tuning run, evaluated on the held-out split after every epoch.
train_history = model.fit(
    x=train_inputs,
    y=y_train,
    validation_data=(val_inputs, y_test),
    epochs=2,
    batch_size=36,
)

Epoch 1/2




Epoch 2/2


After 2 epochs, the training accuracy keeps increasing, but the validation accuracy has not improved much.

However, for now I will continue with the current hyperparameter settings. Next, I will use the entire training data to train the model.

In [20]:
# Here use all the data to train
x_train_all = tokenizer(
    text=df['message'].tolist(),
    add_special_tokens=True,
    max_length=85,
    truncation=True,
    # Pad every message to max_length so the tensors always match the model's
    # fixed input width, regardless of the longest message in the data.
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True)

In [21]:
# Reload the pretrained weights so the final model starts from the original
# BERT checkpoint rather than from the encoder fine-tuned in the validation run.
bert = TFBertModel.from_pretrained("bert-base-uncased")  # Get the original bert model again

# Same setting for neural network layers
max_len = 85  # must equal the tokenizer's max_length
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
# Take the first BERT output (the per-token hidden states) as embeddings.
embeddings = bert(input_ids, attention_mask=input_mask)[0]
# Collapse the token axis so each post becomes one fixed-size vector.
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)
y = Dense(3, activation='softmax')(out)  # Here we have 3 classes
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
# Fine-tune BERT together with the head. Set the flag on the layer object
# itself instead of relying on its position in model.layers.
bert.trainable = True

# Same optimizer setting
optimizer = Adam(
    learning_rate=5e-05,  # this learning rate is for bert model, taken from huggingface website
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)
# Set loss and metrics
loss = CategoricalCrossentropy()
# CategoricalAccuracy is plain accuracy, so it is labelled as such; an explicit
# list replaces the tuple that a stray trailing comma used to create.
metric = [CategoricalAccuracy('accuracy')]

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metric)

# Train the model using all training data (no validation split this time)
train_history = model.fit(
    x={'input_ids': x_train_all['input_ids'], 'attention_mask': x_train_all['attention_mask']},
    y=df[['Appreciation', 'Complaint', 'Feedback']].values,
    epochs=2,
    batch_size=36
)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/2




Epoch 2/2


### Predict unlabelled data

The processing step for unlabelled data is the same as above

In [22]:
# Read the unlabelled posts (tab-separated, header in the first row) and
# preview the first few rows.
unlabeled_path = 'FB_posts_unlabeled.txt'
df_test = pd.read_csv(unlabeled_path, header=0, sep='\t')
df_test.head()

Unnamed: 0,postId,message
0,108381603303_10151119973393304,Love. It. To
1,115568331790246_371841206162956,NICE
2,115568331790246_515044031842672,Congrats
3,147285781446_10151010892176447,Awesome!
4,159616034235_10150639103634236,Award


In [23]:
x_unlab = tokenizer(
    text=df_test.message.tolist(),
    add_special_tokens=True,
    max_length=85,
    truncation=True,
    # Pad every message to max_length. With padding=True the tensors are only
    # as wide as the longest message here, which does not match the model's
    # fixed input width when all unlabelled posts are short.
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True)

Making prediction

In [None]:
# Assemble the named model inputs, then predict class probabilities for the
# unlabelled posts.
unlab_inputs = {name: x_unlab[name] for name in ('input_ids', 'attention_mask')}
y_pred_prob = model.predict(unlab_inputs)



In [None]:
# Convert the predicted probabilities into the accepted submission format,
# pairing each row with its post id.
post_ids = df_test['postId']
res_df = format_result(y_pred_prob, post_ids)

In [None]:
# Write the formatted predictions to a CSV file without the index column
predictions_path = 'predictions.csv'
res_df.to_csv(predictions_path, index=False)

The predictions obtained here achieved an average F1-score of 0.87 across the 3 classes.