# Author Prediction - BERT

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import torch
import os

2025-02-14 15:06:02.121145: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-14 15:06:02.124112: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-14 15:06:02.532276: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-14 15:06:03.407544: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # Supress tensorflow warnings

## Data Preprocessing

### Importing the Dataset

In [3]:
dataset = pd.read_csv("Go Emotions Dataset.csv")
dataset.columns

Index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear', 'admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

## Encoding and Tokenisation

### Encoding

In [4]:
X = dataset["text"].values
y = dataset.iloc[:, 9:].values

In [5]:
print(y)

[[0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 1 0 0]]


In [6]:
# Sum each emotion column
class_counts = y.sum(axis=0)

# Calculate class weights (one over frequency)
class_weights = 1 / (class_counts / class_counts.sum())  
class_weights /= class_weights.max()

# Convert to tensor
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

### Tokenisation

In [7]:
# Remove empty strings, needed for BERT tokeniser
X = [x for x in X if x.strip() != '']

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [9]:
from transformers import BertTokenizerFast

# Max length chosen based on data set size after tokenization
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

X_tokenized = tokenizer(
    X_train, 
    padding="max_length", 
    truncation=True,
    max_length=50,
    return_tensors='tf'
)

X_test_tokenized = tokenizer(
    X_test, 
    padding="max_length", 
    truncation=True,
    max_length=50,
    return_tensors='tf'
)

2025-02-14 15:06:53.689675: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-14 15:06:53.738179: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-14 15:06:53.738218: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-14 15:06:53.743977: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-14 15:06:53.744018: I external/local_xla/xla/stream_executor

## Creating and Training Model

In [10]:
from transformers import TFBertModel
from tensorflow.keras.layers import Input, Dense, Dropout, Lambda
from tensorflow.keras.models import Model

# Load pretrained BERT model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

def create_model(learning_rate=2e-5, dropout_rate=0.3):
    # Define input layers
    input_ids = tf.keras.layers.Input(shape=(50,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.layers.Input(shape=(50,), dtype=tf.int32, name="attention_mask")
    
    # Define output
    bert_output = bert_model([input_ids, attention_mask])
    pooled_output = bert_output.pooler_output
    
    # Add dropout (prevent overfitting)
    dropout = Dropout(dropout_rate)(pooled_output)
    
    # Create classification layers
    go_emotion_options = 28
    output = Dense(go_emotion_options, activation="sigmoid")(dropout)
    
    # # Create model
    model = Model(inputs=[input_ids, attention_mask], outputs=output)
    
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss="binary_crossentropy",
        metrics=["AUC"]
    )

    return model

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [11]:
# Freeze all layers
for layer in bert_model.layers:
    layer.trainable = False

# Unfreeze only the last few layers (e.g., last 4)
for layer in bert_model.layers[-4:]:  
    layer.trainable = True

In [18]:
learning_rate = 5e-6
dropout_rate = 0.5

# Create the model
model = create_model(learning_rate=learning_rate, dropout_rate=dropout_rate)

## Early Stopping

In [14]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor='val_loss',         # Monitor validation loss
    patience=2,                 # Stop after 2 epochs with no improvement
    restore_best_weights=True   # Restore model weights from the best epoch
)

In [17]:
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",   # Watch validation loss
    factor=0.5,           # Reduce LR by half
    patience=2,           # Wait 2 epochs before reducing
    min_lr=1e-6,          # Don't go below 1e-6
    verbose=1             # Print updates
)

In [20]:
# Set epochs lower if not using early stopping, observed some good results at 5+
# Converges at around 19 with seven authors, takes 1h30m on CPU
# Lower batch size if running on GPU and you get an out of memory error, 8 seems to work for 8gb VRAM
epochs = 6
batch_size = 8

# Label input ids and attention mask for BERT (from BERT tokenizer), for the sake of convienience
train_input_ids = X_tokenized['input_ids']
train_attention_mask = X_tokenized['attention_mask']
test_input_ids = X_test_tokenized['input_ids']
test_attention_mask = X_test_tokenized['attention_mask']

# Train the model
model.fit(
    [train_input_ids, train_attention_mask],
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([test_input_ids, test_attention_mask], y_test),
    callbacks=[early_stopping, lr_scheduler]
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 5: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-06.


<keras.src.callbacks.History at 0x7f30583b7a00>

## Evaluating Model

In [21]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = model.predict([test_input_ids, test_attention_mask])
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)  # Convert one-hot labels to class indices

print(confusion_matrix(y_true_classes, y_pred_classes))
print("\n")
print(classification_report(y_true_classes, y_pred_classes))

[[ 632   42   17   24   57   15    7   17    4    7   16   12    4   15
     5   76    0   32   74    1   30    0    1    0    3   11   32  256]
 [  17  395    6   11    6    4    3    3    3    2    6    4    1    3
     2   15    0   18    8    0    6    0    1    0    2    5   11   58]
 [   7    5  185   85    9    7    3   10    6   10   22   20    4    1
     6    4    0    3    2    0    8    0    0    0    0   13    4  102]
 [  11   19   59  146   27   17   21   15    2   20   54   29    8    3
    11   11    0    6   17    0    8    0    1    0    3   22   15  229]
 [  81   19    5   24  192   25   14    8    2   10   35    8    1   10
    14   31    0   31   40    0   37    0    6    0    6   13    4  408]
 [  10    1    5    5    7   93    3    4    4    1   10    0    2    1
     1   13    0   17    8    0   30    0    0    0    7   21    1   89]
 [   9    6    4   10    9    1  143   73    0   10   13    4    0    3
     2    7    0    1    6    0    7    0    2    0    5  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
loss, accuracy = model.evaluate(
    [train_input_ids, train_attention_mask],  # Model inputs
    y_train  # True labels
)

print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

Test Loss: 0.10440154373645782
Test Accuracy: 0.9260385036468506


### Sanity Check

In [29]:
def analysis(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        new_text = file.read()
    
    new_text_tokenized = tokenizer(
        new_text, 
        padding="max_length", 
        truncation=True,
        max_length=50,
        return_tensors='tf'
    )
    
    # Make predictions on the test data
    predictions = model.predict([new_text_tokenized['input_ids'], new_text_tokenized['attention_mask']])
    
    # Since we're predicting one sample, extract the first (and only) result
    predicted_probabilities = predictions[0]
    print("Predicted probabilities:", predicted_probabilities)
    
    # If you have a dictionary mapping indices to emotion names, use it to list the predicted emotions.
    # For example, assuming a mapping like this:
    emotion_labels = {
        0: "admiration", 1: "amusement", 2: "anger", 3: "annoyance",
        4: "approval", 5: "caring", 6: "confusion", 7: "curiosity",
        8: "desire", 9: "disappointment", 10: "disapproval", 11: "disgust",
        12: "embarrassment", 13: "excitement", 14: "fear", 15: "gratitude",
        16: "grief", 17: "joy", 18: "love", 19: "nervousness",
        20: "optimism", 21: "pride", 22: "realization", 23: "relief",
        24: "remorse", 25: "sadness", 26: "surprise", 27: "neutral"
    }
    
    top5_indices = np.argsort(predicted_probabilities)[::-1][:5]
    
    print("Top 5 Emotions:")
    for idx in top5_indices:
        emotion_name = emotion_labels[idx]
        probability_percent = predicted_probabilities[idx] * 100
        print(f"{emotion_name}: {probability_percent:.2f}%")

In [33]:
selected_file = "Wheel of Time - Epilogue"

def load_text(path):
    current_dir = os.path.dirname(os.path.realpath('__file__'))
    file_path = os.path.join(current_dir, path)
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

file = "New Text/" + selected_file + ".txt"

analysis(file)

Predicted probabilities: [0.00574055 0.00363898 0.01780611 0.04828649 0.02478977 0.00777834
 0.03122058 0.00708699 0.00913511 0.10208075 0.02524089 0.01111242
 0.01050831 0.01088709 0.01747372 0.00672397 0.00718896 0.00211868
 0.0009797  0.01384205 0.01746555 0.0025071  0.04839845 0.00713099
 0.00711925 0.03564068 0.02668269 0.43117535]
Top 5 Emotions:
neutral: 43.12%
disappointment: 10.21%
realization: 4.84%
annoyance: 4.83%
sadness: 3.56%
