# Author Prediction - BERT

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_addons as tfa
import torch
import os

2025-02-15 19:36:36.706294: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-15 19:36:36.708338: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-15 19:36:36.954476: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-15 19:36:37.455515: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

TensorFlow Addons (TFA) has ended development and in

In [2]:
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # Supress tensorflow warnings

## Data Preprocessing

### Importing the Dataset

In [3]:
dataset = pd.read_csv("EmoBank Dataset.csv")
dataset.columns

Index(['id', 'split', 'V', 'A', 'D', 'text'], dtype='object')

In [4]:
dataset = dataset[dataset['text'].notna()]

## Encoding and Tokenisation

### Encoding

In [5]:
X = dataset["text"].values
y = dataset.iloc[:, 2:5].values

### Tokenisation

In [6]:
# Remove empty strings, needed for BERT tokeniser
X = [x for x in X if x.strip() != '']

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [8]:
print("Mean VAD scores:", np.mean(y_train, axis=0))
print("Min VAD scores:", np.min(y_train, axis=0))
print("Max VAD scores:", np.max(y_train, axis=0))

Mean VAD scores: [2.97921223 3.04346918 3.06416377]
Min VAD scores: [1.2 1.8 2. ]
Max VAD scores: [4.6 4.4 4.2]


In [9]:
from transformers import BertTokenizerFast

# Max length chosen based on data set size after tokenization
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

X_tokenized = tokenizer(
    X_train, 
    padding="max_length", 
    truncation=True,
    max_length=70,
    return_tensors='tf'
)

X_test_tokenized = tokenizer(
    X_test, 
    padding="max_length", 
    truncation=True,
    max_length=70,
    return_tensors='tf'
)

2025-02-15 19:37:20.483467: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-15 19:37:20.509838: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-15 19:37:20.509881: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-15 19:37:20.513259: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-15 19:37:20.513302: I external/local_xla/xla/stream_executor

## Creating and Training Model

In [10]:
import tensorflow as tf

def weighted_mse(y_true, y_pred):
    # Compute element-wise squared errors; shape: (batch_size, 3)
    errors = tf.square(y_true - y_pred)
    
    # Compute the mean of y_true for each output dimension; shape: (1, 3)
    mean_y_true = tf.reduce_mean(y_true, axis=0, keepdims=True)
    
    # Compute weights for each element: 1 + abs(y_true - mean_y_true)
    # This gives a tensor of shape: (batch_size, 3)
    weights = 1 + tf.abs(y_true - mean_y_true)

    # Multiply element-wise errors by weights
    weighted_errors = errors * weights  # shape: (batch_size, 3)
    
    # Return the mean of all weighted errors as a scalar
    return tf.reduce_mean(weighted_errors)

In [11]:
from transformers import TFBertModel
from tensorflow.keras.layers import Input, Dense, Dropout, Lambda
from tensorflow.keras.models import Model

# Load pretrained BERT model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

def create_model(learning_rate=2e-5, dropout_rate=0.3):
    # Define input layers
    input_ids = tf.keras.layers.Input(shape=(70,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.layers.Input(shape=(70,), dtype=tf.int32, name="attention_mask")
    
    # Define output
    bert_output = bert_model([input_ids, attention_mask])
    pooled_output = bert_output.pooler_output
    
    # Add dropout (prevent overfitting)
    dropout = Dropout(dropout_rate)(pooled_output)

    # Create classification layers
    emoBank_options = 3
    output = Dense(emoBank_options, activation="linear")(dropout)
    
    # # Create model
    model = Model(inputs=[input_ids, attention_mask], outputs=output)
    
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=weighted_mse,
        metrics=["mse", "mae"]
    )

    return model

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [12]:
# Freeze all layers
for layer in bert_model.layers:
    layer.trainable = False

# Unfreeze only the last few layers (e.g., last 4)
for layer in bert_model.layers[:1]:  
    layer.trainable = True

In [13]:
# Choose between a fixed learning rate and CLR (cyclical, bounces between two rates specified)
learning_rate = 3e-5
# learning rate = tfa.optimizers.CyclicalLearningRate(
#     initial_learning_rate=3e-6,   # Minimum learning rate
#     maximal_learning_rate=3e-5,   # Maximum learning rate
#     step_size=2000,               # Steps to reach max_lr before decreasing
#     scale_fn=lambda x: 1 / (2.0 ** (x - 1))  # Scaling function (triangular2 policy)
# )

dropout_rate = 0.3

# Create the model
model = create_model(learning_rate=learning_rate, dropout_rate=dropout_rate)

## Early Stopping

In [14]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor='val_loss',         # Monitor validation loss
    patience=3,                 # Stop after 2 epochs with no improvement
    restore_best_weights=True   # Restore model weights from the best epoch
)

In [15]:
epochs = 25
batch_size = 32

# Label input ids and attention mask for BERT (from BERT tokenizer), for the sake of convienience
train_input_ids = X_tokenized['input_ids']
train_attention_mask = X_tokenized['attention_mask']
test_input_ids = X_test_tokenized['input_ids']
test_attention_mask = X_test_tokenized['attention_mask']

# Train the model
model.fit(
    [train_input_ids, train_attention_mask],
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([test_input_ids, test_attention_mask], y_test),
    callbacks=[early_stopping]
)

Epoch 1/25


2025-02-15 19:37:42.182112: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f4f438d6b60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-02-15 19:37:42.182139: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Ti, Compute Capability 8.6
2025-02-15 19:37:42.232833: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-02-15 19:37:42.285492: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
I0000 00:00:1739648262.354705   84249 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25


<keras.src.callbacks.History at 0x7f5082dce260>

## Evaluating Model

In [17]:
result = model.evaluate(
    [train_input_ids, train_attention_mask],  # Model inputs
    y_train  # True labels
)

print(f"Test Loss: {result[0]}")
print(f"Test MSE: {result[1]}")
print(f"Test MAE: {result[2]}")

Test Loss: 0.03597669675946236
Test MSE: 0.027147093787789345
Test MAE: 0.12716332077980042


### Sanity Check

In [19]:
def analysis(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        new_text = file.read()
    
    new_text_tokenized = tokenizer(
        new_text, 
        padding="max_length", 
        truncation=True,
        max_length=50,
        return_tensors='tf'
    )
    
    # Make predictions on the test data
    predictions = model.predict([new_text_tokenized['input_ids'], new_text_tokenized['attention_mask']])
    
    # Since we're predicting one sample, extract the first (and only) result
    predicted_probabilities = predictions[0]
    print("Valence: ", predicted_probabilities[0])
    print("Arousal: ", predicted_probabilities[1])
    print("Dominance: ", predicted_probabilities[2])

In [21]:
selected_file = "Veins of Gold"

def load_text(path):
    current_dir = os.path.dirname(os.path.realpath('__file__'))
    file_path = os.path.join(current_dir, path)
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

file = "New Text/" + selected_file + ".txt"

analysis(file)

Valence:  2.94784
Arousal:  3.0081213
Dominance:  2.984594


In [18]:
import tensorflow as tf

# Define sanity check examples with expected outcomes
test_sentences = {
    "High Valence": "I feel absolutely wonderful today! Everything is going perfectly.",
    "High Arousal": "I can't believe it! This is the most exciting moment of my life!",
    "High Dominance": "I am the leader here. Everyone follows my commands.",
    "Low Valence": "Everything is falling apart. I feel empty and hopeless.",
    "Low Arousal": "It's just another slow and uneventful day at work.",
    "Low Dominance": "I feel so small and helpless in this overwhelming situation.",
    "High VAD (Excited Power)": "I just won the championship! I feel unstoppable!",
    "Low Valence, High Arousal, Low Dominance": "I'm trapped and panicking! There's no escape!"
}

def sanity_check_model(model, tokenizer, test_sentences):
    for label, sentence in test_sentences.items():
        # Tokenize the input sentence
        encoded_input = tokenizer(
            sentence,
            padding="max_length",
            truncation=True,
            max_length=50,
            return_tensors="tf"
        )
        
        # Run prediction
        prediction = model.predict([encoded_input["input_ids"], encoded_input["attention_mask"]])
        
        # Extract scores
        valence, arousal, dominance = prediction[0]  # Unpack first prediction

        # Print results
        print(f"📝 **{label}**")
        print(f"   🟢 Valence:  {valence:.4f}")
        print(f"   🔴 Arousal:  {arousal:.4f}")
        print(f"   🔵 Dominance: {dominance:.4f}\n")
        
# Run the sanity check
sanity_check_model(model, tokenizer, test_sentences)


📝 **High Valence**
   🟢 Valence:  4.2690
   🔴 Arousal:  3.8780
   🔵 Dominance: 3.4783

📝 **High Arousal**
   🟢 Valence:  4.3619
   🔴 Arousal:  4.4137
   🔵 Dominance: 3.4335

📝 **High Dominance**
   🟢 Valence:  3.1962
   🔴 Arousal:  3.1353
   🔵 Dominance: 3.3430

📝 **Low Valence**
   🟢 Valence:  2.0306
   🔴 Arousal:  3.1616
   🔵 Dominance: 2.4815

📝 **Low Arousal**
   🟢 Valence:  2.8379
   🔴 Arousal:  2.8326
   🔵 Dominance: 2.8743

📝 **Low Dominance**
   🟢 Valence:  2.0552
   🔴 Arousal:  3.2458
   🔵 Dominance: 2.6137

📝 **High VAD (Excited Power)**
   🟢 Valence:  3.4171
   🔴 Arousal:  4.2227
   🔵 Dominance: 3.5426

📝 **Low Valence, High Arousal, Low Dominance**
   🟢 Valence:  2.2353
   🔴 Arousal:  3.9827
   🔵 Dominance: 3.1016

