# Author Prediction - BERT

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
tf.__version__

2025-02-17 14:52:07.212658: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-17 14:52:07.215737: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-17 14:52:07.634684: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-17 14:52:08.526961: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'2.15.1'

## Data Preprocessing

### Importing the Dataset

In [2]:
import os

current_dir = os.path.dirname(os.path.realpath('__file__'))
root = os.path.join(current_dir, "Books")

data = []

# Loop through author and book folders, adding .txt file contents to data with matching labels
for author in os.listdir(root):
    author_path = os.path.join(root, author)

    for book in os.listdir(author_path):
        book_path = os.path.join(author_path, book)

        for chapter in os.listdir(book_path):
            chapter_path = os.path.join(book_path, chapter)
            
            with open(chapter_path, 'r', encoding='utf-8') as file:
                text = file.read()
                
            data.append({
                'Author': author,
                'Book': book,
                'Chapter': chapter[:-4],
                'Text': text
            })

dataset = pd.DataFrame(data)

### Splitting and Cleaning

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

fragment_size = 200
overlap = 50

# First we clean our text, converting to lower case and removing unwanted characted
# Then we apply corpus, simplifying our text
# Finally we split our text into fragments of 'fragment_size', with an overlap of 'overlap' words from the previous fragment

def preprocess_text(text, ps, all_stopwords):
    # Clean text
    text = text.lower()  # Lowercase
    text = re.sub(r'\n', " ", text)  # Newlines
    text = re.sub(r'[^a-zA-Z\s]', " ", text)  # Punctuation and special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Extra spaces

    # Apply corpus
    words = text.split()
    words = [ps.stem(word) for word in words if word not in all_stopwords]
    processed_text = " ".join(words)

    return processed_text

def fragment_text(text, fragment_size, overlap):
    # Split text into fragments of fragment_size length, returns array of fragments
    words = text.split()
    current_text_fragments = []
    
    step_size = fragment_size - overlap  
    
    for i in range(0, len(words), step_size):
        current_fragment = " ".join(words[i:i + fragment_size])
        current_text_fragments.append(current_fragment)

        # Handle situation where final chapter fragment is already contained in the previous fragment
        if len(words) - i < fragment_size:
            break
        
    return current_text_fragments

In [4]:
ps = PorterStemmer()
nltk.download('stopwords')
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

# Apply our cleaning and create a new dataset to replace our previous one, this time with processed text
text_fragments = []
for index, row in dataset.iterrows():
    text = row["Text"]
    text = preprocess_text(text, ps, all_stopwords)
    current_text_fragments = fragment_text(text, fragment_size, overlap)
    
    for text_fragment in current_text_fragments:
        text_fragments.append({
            "Book": row["Book"],
            "Author": row["Author"],
            "Text": text_fragment
        })

# Convert the data fragments into a Pandas DataFrame and replace the original
dataset = pd.DataFrame(text_fragments)

[nltk_data] Downloading package stopwords to /home/lyons/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Encoding and Tokenisation

### Encoding

In [5]:
X = dataset["Text"].values
y = dataset["Author"].values

In [6]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Reshape y to a 2D array (needed for OneHotEncoder)
y_reshaped = y.reshape(-1, 1)

# Create and fit OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
y_encoded = encoder.fit_transform(y_reshaped)

In [7]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Creating class weights (in order to reduce overfitting from imbalanced data size per author)
y_class_indices = np.argmax(y_encoded, axis=-1)

class_weights = compute_class_weight(
    'balanced', 
    classes=np.unique(y_class_indices), 
    y=y_class_indices
)

# Creating class weights dictionary
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

### Tokenisation

In [8]:
# Remove empty strings after .strip() applied
# There shouldn't be any but BERT tokeniser fails if not done
X = [x for x in X if x.strip() != '']

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size = 0.2, random_state = 1)

In [10]:
from transformers import BertTokenizerFast

# Max length chosen based on token distribution coming from fragment size to minimise truncation
# Edit if changing fragment_size, or optionally remove  entirely though this should be less efficient
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

X_tokenized = tokenizer(
    X_train, 
    padding=True, 
    truncation=True,
    max_length=330,
    return_tensors='tf'
)

X_test_tokenized = tokenizer(
    X_test, 
    padding=True, 
    truncation=True,
    max_length=330,
    return_tensors='tf'
)

2025-02-17 14:53:02.687549: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-17 14:53:02.734598: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-17 14:53:02.734638: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-17 14:53:02.739449: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-17 14:53:02.739501: I external/local_xla/xla/stream_executor

## Creating and Training Model

In [11]:
from transformers import TFBertModel
from tensorflow.keras.layers import Input, Dense, Dropout, Lambda
from tensorflow.keras.models import Model

# Load pretrained BERT model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

def create_model(learning_rate=2e-5, dropout_rate=0.3):
    # Define input layers
    input_ids = tf.keras.layers.Input(shape=(330,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.layers.Input(shape=(330,), dtype=tf.int32, name="attention_mask")
    
    # Define output
    bert_output = bert_model([input_ids, attention_mask])
    pooled_output = bert_output.pooler_output
    
    # Add dropout (prevent overfitting)
    dropout = Dropout(dropout_rate)(pooled_output)
    
    # Create classification layers
    num_authors = len(dataset['Author'].unique())
    output = Dense(num_authors, activation="softmax")(dropout)
    
    # # Create model
    model = Model(inputs=[input_ids, attention_mask], outputs=output)
    
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss="categorical_crossentropy",
        metrics=["accuracy"]
    )

    return model

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [12]:
# Freeze all layers
for layer in bert_model.layers:
    layer.trainable = False

# Unfreeze only the last few layers (e.g., last 4)
for layer in bert_model.layers[-4:]:  
    layer.trainable = True

In [13]:
learning_rate = 1e-5
dropout_rate = 0.4

# Create the model
model = create_model(learning_rate=learning_rate, dropout_rate=dropout_rate)

# model.summary()

## Early Stopping

In [14]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor='val_loss',         # Monitor validation loss
    patience=3,                 # Stop after 3 epochs with no improvement
    restore_best_weights=True   # Restore model weights from the best epoch
)

In [15]:
# Set epochs lower if not using early stopping, observed some good results at 5+
# Converges at around 19 with seven authors, takes 1h30m on CPU
# Lower batch size if running on GPU and you get an out of memory error, 8 seems to work for 8gb VRAM
epochs = 50
batch_size = 8

# Label input ids and attention mask for BERT (from BERT tokenizer), for the sake of convienience
train_input_ids = X_tokenized['input_ids']
train_attention_mask = X_tokenized['attention_mask']
test_input_ids = X_test_tokenized['input_ids']
test_attention_mask = X_test_tokenized['attention_mask']

# Train the model
model.fit(
    [train_input_ids, train_attention_mask],
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([test_input_ids, test_attention_mask], y_test),
    callbacks=[early_stopping]
)

Epoch 1/50


2025-02-17 14:53:24.463399: I external/local_xla/xla/service/service.cc:168] XLA service 0x7fb49878add0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-02-17 14:53:24.463427: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Ti, Compute Capability 8.6
2025-02-17 14:53:24.558007: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-02-17 14:53:24.661506: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
I0000 00:00:1739804004.751582     608 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50


<keras.src.callbacks.History at 0x7fb55e161b70>

## Evaluating Model

In [16]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = model.predict([test_input_ids, test_attention_mask])
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)  # Convert one-hot labels to class indices

print(confusion_matrix(y_true_classes, y_pred_classes))
print("\n")
print(classification_report(y_true_classes, y_pred_classes))

[[36  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 19  0  0  0  0  0  0  0  0  1  0]
 [ 0  0 18  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 27  0  0  0  0  0  0  0  0]
 [ 0  0  0  1 38  0  0  0  1  0  0  0]
 [ 0  0  0  1  0 13  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 66  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  7  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 19  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 33  1  0]
 [ 0  1  0  0  0  0  0  0  0  0 27  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 16]]


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        36
           1       0.95      0.95      0.95        20
           2       1.00      1.00      1.00        18
           3       0.93      1.00      0.96        27
           4       1.00      0.95      0.97        40
           5       1.00      0.93      0.96        14
           6       1.00      1.00      1.00        66
           7       1.00      1.00      1.00         7
           8       0.95      1.00      0.9

In [17]:
loss, accuracy = model.evaluate(
    [train_input_ids, train_attention_mask],  # Model inputs
    y_train  # True labels
)

print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

Test Loss: 0.005412133876234293
Test Accuracy: 1.0


### Sanity Check

In [18]:
# Testing model on data not contained within original dataset
tests_path = os.path.join(current_dir, "Tests")

for author in os.listdir(tests_path):
    test_file_path = os.path.join(tests_path, author)
    
    with open(test_file_path, 'r', encoding='utf-8') as file:
        new_text = file.read()

    processed_new_text = preprocess_text(new_text, ps, all_stopwords)
    
    X_test_tokenized = tokenizer(
        processed_new_text, 
        padding='max_length',
        truncation=True,
        max_length=330,
        return_tensors='tf'
    )

    # Make predictions on the test data
    predictions = model.predict([X_test_tokenized['input_ids'], X_test_tokenized['attention_mask']])
    
    # Get the predicted class
    predicted_classes = predictions.argmax(axis=-1)  # Get the index of the highest probability
    
    # Map to predicted class to author name
    author_names = dataset['Author'].unique()
    author_mapping = {index: author for index, author in enumerate(author_names)}
    predicted_authors = [author_mapping[i] for i in predicted_classes]
    
    print(f"Prediction: {predicted_authors} vs Actual: {author[:-4]}")

Prediction: ['Brandon Sanderson - Wheel of Time'] vs Actual: Brandon Sanderson - Wheel of Time
Prediction: ['Brandon Sanderson'] vs Actual: Brandon Sanderson
Prediction: ['Charles Dickens'] vs Actual: Charles Dickens
Prediction: ['F Scott Fitzgerald'] vs Actual: F Scott Fitzgerald
Prediction: ['Fyodor Dostoyevsky'] vs Actual: Fyodor Dostoyevsky
Prediction: ['Herman Melville'] vs Actual: Herman Melville
Prediction: ['James Joyce'] vs Actual: James Joyce
Prediction: ['Mark Twain'] vs Actual: Mark Twain
Prediction: ['Mary Shelley'] vs Actual: Mary Shelley
Prediction: ['Oscar Wilde'] vs Actual: Oscar Wilde
Prediction: ['Robert Jordan'] vs Actual: Robert Jordan
Prediction: ['Robert Louise Stevenson'] vs Actual: Robert Louise Stevenson


### Individual In Depth Analysis

In [19]:
def analysis(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        new_text = file.read()
    
    processed_new_text = preprocess_text(new_text, ps, all_stopwords)
    
    X_test_tokenized = tokenizer(
        processed_new_text, 
        padding='max_length',
        truncation=True,
        max_length=330,
        return_tensors='tf'
    )
    
    # Make predictions on the test data
    predictions = model.predict([X_test_tokenized['input_ids'], X_test_tokenized['attention_mask']])
    
    # Get the predicted class
    predicted_classes = predictions.argmax(axis=-1)  # Get the index of the highest probability
    
    # Map to predicted class to author name
    author_names = dataset['Author'].unique()
    author_mapping = {index: author for index, author in enumerate(author_names)}
    predicted_authors = [author_mapping[i] for i in predicted_classes]
    
    print(f"Prediction: {predicted_authors} vs Actual: {author[:-4]}")
    for i,v in enumerate(predictions[0]):
        print(f"{predictions[0][i]} - {author_names[i]}")

In [20]:
tests_path = os.path.join(current_dir, "Tests")
author = "Brandon Sanderson - Wheel of Time.txt"
test_file_path = os.path.join(tests_path, author)

analysis(test_file_path)

Prediction: ['Brandon Sanderson - Wheel of Time'] vs Actual: Brandon Sanderson - Wheel of Time
0.0013688578037545085 - Brandon Sanderson
0.9883411526679993 - Brandon Sanderson - Wheel of Time
0.00026189169147983193 - Charles Dickens
0.00015236025501508266 - F Scott Fitzgerald
0.0006222220254130661 - Fyodor Dostoyevsky
0.0011954059591516852 - Herman Melville
0.001646515796892345 - James Joyce
0.0010871358681470156 - Mark Twain
0.0002751499123405665 - Mary Shelley
0.0009100940660573542 - Oscar Wilde
0.0023368445690721273 - Robert Jordan
0.0018024229211732745 - Robert Louise Stevenson


In [21]:
epilogue_file_path = os.path.join(current_dir, "New Text/Misc.txt")
analysis(epilogue_file_path)

Prediction: ['F Scott Fitzgerald'] vs Actual: Brandon Sanderson - Wheel of Time
0.0052894349209964275 - Brandon Sanderson
0.0035756933502852917 - Brandon Sanderson - Wheel of Time
0.0012311025056988 - Charles Dickens
0.8391664028167725 - F Scott Fitzgerald
0.006988740060478449 - Fyodor Dostoyevsky
0.012365605682134628 - Herman Melville
0.0007439659093506634 - James Joyce
0.0018252100562676787 - Mark Twain
0.00465387525036931 - Mary Shelley
0.11554472893476486 - Oscar Wilde
0.0076397317461669445 - Robert Jordan
0.0009754406055435538 - Robert Louise Stevenson
