In [1]:
import tensorflow as tf
import pickle
import numpy as np

from shared_project_functions import get_target_subdirectory

2025-09-20 23:36:45.975871: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
base_corpus = "doyleX"

# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# because later cells build their file paths from it.
dir = get_target_subdirectory(base_corpus, subdir_string="model")

# Load preprocessed data from subdirectory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project's own preprocessing step.
with open(f"{dir}/{base_corpus}_preprocessed_data.pkl", "rb") as f:
    data = pickle.load(f)

# Unpack after the file is closed; `data` is a plain in-memory object by now.
X_train = data["X_train"]
y_train = data["y_train"]
word_to_id = data["word_to_id"]
# No trailing comma here: a trailing comma would wrap the mapping in a 1-tuple.
id_to_word = data["id_to_word"]
max_seq_length = data["max_seq_length"]

In [3]:
import numpy as np

# Id of the padding token: used to fill short sequences and to build the mask.
pad_id = word_to_id["<PAD>"]

# Pad the raw sequences BEFORE forcing them into an ndarray. Calling
# np.array() on lists of unequal length raises ValueError on NumPy >= 1.24
# (and silently yields an object array on older versions), so the conversion
# must not come first.
already_padded = (
    isinstance(X_train, np.ndarray)
    and X_train.ndim == 2
    and X_train.shape[1] == max_seq_length
)
if not already_padded:
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    # pad_sequences returns an int32 array of shape (num_samples, max_seq_length).
    X_train = pad_sequences(X_train, maxlen=max_seq_length, padding='post', value=pad_id)

y_train = np.asarray(y_train)

# Create attention mask: 1 for non-pad, 0 for pad
attention_mask = (X_train != pad_id).astype(np.int32)

# Save for later use
transformer_data_path = f"{dir}/{base_corpus}_transformer_data.npz"
np.savez_compressed(
    transformer_data_path,
    X_train=X_train,
    y_train=y_train,
    attention_mask=attention_mask,
    max_seq_length=max_seq_length
)

print(f"Saved padded X_train, y_train, and attention_mask to {transformer_data_path}")

Saved padded X_train, y_train, and attention_mask to model_4_doyleX/doyleX_transformer_data.npz


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load data (if not already in memory).
# np.load on an .npz returns a lazy NpzFile that holds the archive open;
# the context manager closes it once the arrays have been read out.
with np.load(f"{dir}/{base_corpus}_transformer_data.npz") as data:
    X_train = data["X_train"]
    y_train = data["y_train"]
    attention_mask = data["attention_mask"]
    # Stored as a 0-d array by savez; convert back to a plain Python int.
    max_seq_length = int(data["max_seq_length"])

# One output class per vocabulary entry (word_to_id comes from the pickle cell).
vocab_size = len(word_to_id)

# Positional Encoding Layer
class PositionalEncoding(layers.Layer):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    A (1, max_len, d_model) table is precomputed once: even feature columns
    hold sin(pos * rate) and odd columns hold cos(pos * rate), where
    rate = 1 / 10000 ** (2 * (i // 2) / d_model) for feature column i.

    Args:
        max_len: Number of positions to precompute.
        d_model: Size of the feature (last) axis of the inputs.
        **kwargs: Standard base-layer arguments (e.g. name, trainable, dtype).
            Accepting them lets the layer be re-created from its saved config.
    """

    def __init__(self, max_len, d_model, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments as plain ints so get_config() can
        # emit them in a serializable form.
        self.max_len = int(max_len)
        self.d_model = int(d_model)

        pos = np.arange(max_len)[:, np.newaxis]
        i = np.arange(d_model)[np.newaxis, :]
        # Column pairs (2k, 2k+1) share the same rate via i // 2.
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        angle_rads = pos * angle_rates
        pos_encoding = np.zeros((max_len, d_model))
        pos_encoding[:, 0::2] = np.sin(angle_rads[:, 0::2])
        pos_encoding[:, 1::2] = np.cos(angle_rads[:, 1::2])
        # Leading axis of size 1 so the table broadcasts over the batch axis.
        self.pos_encoding = tf.constant(pos_encoding[np.newaxis, ...], dtype=tf.float32)

    def call(self, x):
        """Return x plus the encodings for the first tf.shape(x)[1] positions."""
        return x + self.pos_encoding[:, :tf.shape(x)[1], :]

    def get_config(self):
        """Return the base-layer config extended with this layer's constructor arguments."""
        config = super().get_config()
        config.update({"max_len": self.max_len, "d_model": self.d_model})
        return config

# Model Hyperparameters
d_model = 128        # embedding width and per-head key size used below
num_heads = 4        # attention heads
ff_dim = 256         # hidden width of the feed-forward sub-block
dropout_rate = 0.1   # applied after attention and after the feed-forward block

# Model Definition
# Two inputs: token ids and the matching 1/0 padding mask.
inputs = keras.Input(shape=(max_seq_length,), dtype="int32")
mask_inputs = keras.Input(shape=(max_seq_length,), dtype="int32")

# NOTE(review): mask_zero=True treats token id 0 as padding -- confirm that
# word_to_id["<PAD>"] is 0.
embedding_layer = layers.Embedding(input_dim=vocab_size, output_dim=d_model, mask_zero=True)
x = embedding_layer(inputs)
x = PositionalEncoding(max_seq_length, d_model)(x)

# Transformer Block
# Give the mask a singleton query axis, (batch, 1, seq), so it broadcasts over
# query positions. A Keras layer is used instead of tf.expand_dims because raw
# TensorFlow ops cannot be applied to symbolic Keras inputs under Keras 3.
attention_mask_3d = layers.Reshape((1, max_seq_length))(mask_inputs)
attn_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(
    x, x, attention_mask=attention_mask_3d
)
attn_output = layers.Dropout(dropout_rate)(attn_output)
# Residual connection followed by layer normalization.
attn_output = layers.LayerNormalization(epsilon=1e-6)(x + attn_output)

ffn = keras.Sequential([
    layers.Dense(ff_dim, activation="relu"),
    layers.Dense(d_model),
])
ffn_output = ffn(attn_output)
ffn_output = layers.Dropout(dropout_rate)(ffn_output)
sequence_output = layers.LayerNormalization(epsilon=1e-6)(attn_output + ffn_output)

# Output layer: a softmax over the vocabulary at every sequence position.
outputs = layers.Dense(vocab_size, activation="softmax")(sequence_output)

# For language modeling, we predict the next token at each position
model = keras.Model([inputs, mask_inputs], outputs)

model.compile(
    optimizer="adam",
    # from_logits=False because the output layer already applies softmax.
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=["accuracy"]
)

model.summary()

# Prepare y_train for categorical prediction at each timestep.
# The model emits one distribution per position, so each sample's target is
# repeated along the sequence axis.
# NOTE(review): assumes y_train is 1-D with one target id per sample -- confirm.
y_train_seq = np.expand_dims(y_train, -1)  # adds a trailing axis of size 1
y_train_seq = np.tile(y_train_seq, (1, max_seq_length))  # repeat along the last axis

# Train the model
history = model.fit(
    [X_train, attention_mask],
    y_train_seq,
    batch_size=64,
    epochs=10,
    validation_split=0.1
)

# Save the model next to the other artifacts of this corpus instead of a
# hard-coded file name in the current working directory.
model.save(f"{dir}/{base_corpus}_model.keras")

# Save the training history to subdirectory
with open(f"{dir}/{base_corpus}_training_history.pkl", "wb") as f:
    pickle.dump(history.history, f)