## Transformer model (not pre-trained)

### Library imports

In [2]:
import tensorflow as tf
import pickle
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
from google.colab import drive, runtime
drive.mount('/content/drive')

Mounted at /content/drive


### Data structure definition

In [None]:
base_corpus = "doyle"

#directory from Google Colab, where model was trained
dir = f"/content/drive/MyDrive/AASD4011 Models/JA DoyleX"

#Output file paths and extensions
filepaths = {
    "data": f"{dir}/{base_corpus}_preprocessed_data",
    "model": f"{dir}/doyleX_model",
    "batch": f"{dir}/doyleX_batch_losses",
    "history": f"{dir}/doyleX_training_history",
    "checkpoint": f"{dir}/doyleX_weights"
}
extensions = {
    "data": ".pkl",
    "model": ".keras",
    "batch": ".npy",
    "history": ".pkl",
    "checkpoint": ".weights.h5"
}
data_filepath = filepaths["data"] + extensions["data"]
model_filepath = filepaths["model"] + extensions["model"]
batch_filepath = filepaths["batch"] + extensions["batch"]
history_filepath = filepaths["history"] + extensions["history"]
checkpoint_filepath = filepaths["checkpoint"] + extensions["checkpoint"]


Data file found at /content/drive/MyDrive/AASD4011 Models/JA DoyleX/doyle_preprocessed_data.pkl.
Write test succeeded for: /content/drive/MyDrive/AASD4011 Models/JA DoyleX/doyle_preprocessed_data_test.pkl
Write test succeeded for: /content/drive/MyDrive/AASD4011 Models/JA DoyleX/doyleX_model_test.keras
Write test succeeded for: /content/drive/MyDrive/AASD4011 Models/JA DoyleX/doyleX_batch_losses_test.npy
Write test succeeded for: /content/drive/MyDrive/AASD4011 Models/JA DoyleX/doyleX_training_history_test.pkl


### Colab Tests and validation

Check file I/O before beginning training, to avoid wasting resources

In [None]:
#Check for data file existence
import os
if not os.path.exists(data_filepath):
    raise FileNotFoundError(f"Data file not found at {data_filepath}.")
else:
    print(f"Data file found at {data_filepath}.")

#Check that a file can be saved to model_filepath and history_filepath
for path in [filepaths["data"] + "_test" + extensions["data"],
             filepaths["model"] + "_test" + extensions["model"],
             filepaths["batch"] + "_test" + extensions["batch"],
             filepaths["history"] + "_test" + extensions["history"]]:
    try:
        with open(path, "w") as f:
            f.write("test")
        os.remove(path)
        print(f"Write test succeeded for: {path}")
    except Exception as e:
        print(f"Write test FAILED for: {path} -- {e}")

### Data loading

In [None]:
#Load preprocessed data from subdirectory
with open(data_filepath, "rb") as f:
    data = pickle.load(f)
    X_train = data["X_train"]
    y_train = data["y_train"]
    word_to_id = data["word_to_id"]
    id_to_word = data["id_to_word"]
    max_seq_length = data["max_seq_length"]
    attention_masks = data["attention_masks"] if "attention_masks" in data else None

X_train = np.array(X_train)
y_train = np.array(y_train)
attention_masks = np.array(attention_masks)

vocab_size = len(word_to_id)
print(f"Vocabulary size: {vocab_size}")
print(f"Max sequence length: {max_seq_length}")

Vocabulary size: 40462
Max sequence length: 40


### Transformer (non-GPT) definition

In [None]:
# Positional Encoding Layer (as defined in the original notebook)
class PositionalEncoding(layers.Layer):
    """
    Adds positional encoding to the input embeddings.
    Uses sine for even indices and cosine for odd indices of the embedding dimension.
    Reference: "Attention is All You Need" (Vaswani et al., 2017).
    """
    def __init__(self, max_len, d_model):
        super().__init__()
        pos = np.arange(max_len)[:, np.newaxis]
        i = np.arange(d_model)[np.newaxis, :]
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        angle_rads = pos * angle_rates
        pos_encoding = np.zeros((max_len, d_model))
        pos_encoding[:, 0::2] = np.sin(angle_rads[:, 0::2])
        pos_encoding[:, 1::2] = np.cos(angle_rads[:, 1::2])
        self.pos_encoding = tf.constant(pos_encoding[np.newaxis, ...], dtype=tf.float32)

    def call(self, x):
        return x + self.pos_encoding[:, :tf.shape(x)[1], :]

# Model Hyperparameters
d_model = 128
num_heads = 4
ff_dim = 256
dropout_rate = 0.1
#vocab size defined above
#max_seq_length defined above

# Model Definition
inputs = keras.Input(shape=(max_seq_length,), dtype="int32")
mask_inputs = keras.Input(shape=(max_seq_length,), dtype="int32")

# Embedding and Positional Encoding
embedding_layer = layers.Embedding(input_dim=vocab_size, output_dim=d_model, mask_zero=True)
x = embedding_layer(inputs)
x = PositionalEncoding(max_seq_length, d_model)(x)

# Transformer Block
def expand_mask(m): 
    #Import statements needed for inference step
    import tensorflow as tf
    return tf.expand_dims(m, axis=1)
expanded_mask = layers.Lambda(expand_mask, output_shape=(None, 1, max_seq_length))(mask_inputs)
attn_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(
    x, x, attention_mask=expanded_mask
)
attn_output = layers.Dropout(dropout_rate)(attn_output)
attn_output = layers.LayerNormalization(epsilon=1e-6)(x + attn_output)

ffn = keras.Sequential([
    layers.Dense(ff_dim, activation="relu"),
    layers.Dense(d_model),
])
ffn_output = ffn(attn_output)
ffn_output = layers.Dropout(dropout_rate)(ffn_output)
sequence_output = layers.LayerNormalization(epsilon=1e-6)(attn_output + ffn_output)

# Output layer
outputs = layers.Dense(vocab_size, activation="softmax")(sequence_output)

# For language modeling, we predict the next token at each position
model = keras.Model([inputs, mask_inputs], outputs)

model.compile(
    optimizer="adam",
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=["accuracy"]
)

model.summary() #Print out model layers summary

#Callback 1: Custom history
class BatchLossHistory(keras.callbacks.Callback):
    def on_train_begin(self, logs=None):
        self.losses = []

    def on_batch_end(self, batch, logs=None):
        self.losses.append(logs.get('loss'))

# Instantiate custom callback
batch_history = BatchLossHistory()

#Callback 2: Checkpointing
from tensorflow.keras.callbacks import ModelCheckpoint

model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=False, # Set to True to save only the best model
    save_freq='epoch'
)

#Exclude <PAD> from loss function to avoid simply predicting <PAD>
pad_token_id = word_to_id["<PAD>"]
sample_weights = (y_train != pad_token_id).astype(np.float32)

# Train the model
history = model.fit(
    [X_train, attention_masks],
    y_train,
    sample_weight=sample_weights,
    batch_size=128,
    epochs=20,
    validation_split=0.1,
    callbacks=[batch_history, model_checkpoint_callback]
)
# Save the model
model.save(model_filepath)

#Save the training history to subdirectory
with open(history_filepath, "wb") as f:
    pickle.dump(history.history, f)

#Save batch history
batch_loss_list = batch_history.losses
batch_loss_array = np.array(batch_loss_list)
np.save(batch_filepath, batch_loss_array)

#Disconnect and delete Colab runtime to save resources
runtime.unassign()



Epoch 1/20
[1m1331/1331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 113ms/step - accuracy: 0.2013 - loss: 1.9378 - val_accuracy: 0.4444 - val_loss: 0.3078
Epoch 2/20
[1m1331/1331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 112ms/step - accuracy: 0.4348 - loss: 0.1989 - val_accuracy: 0.4640 - val_loss: 0.1541
Epoch 3/20
[1m1331/1331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 113ms/step - accuracy: 0.4481 - loss: 0.0817 - val_accuracy: 0.4684 - val_loss: 0.1231
Epoch 4/20
[1m1331/1331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 112ms/step - accuracy: 0.4536 - loss: 0.0432 - val_accuracy: 0.4713 - val_loss: 0.1070
Epoch 5/20
[1m1331/1331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 112ms/step - accuracy: 0.4559 - loss: 0.0228 - val_accuracy: 0.4724 - val_loss: 0.1018
Epoch 6/20
[1m1331/1331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 112ms/step - accuracy: 0.4587 - loss: 0.0131 - val_accuracy: 0.4728 - val_loss:

Note: During inference, this from-scratch Tensorflow model performed poorly, with more grammatical errors than LSTM models.  There may be insufficient data for a Transformer model, or some more significant error with the approach used.

## Pretrained GPT-2 models with fine-tuning using our datasets 