In [1]:
!pip install -q datasets transformers

import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer

[0m

2025-04-06 18:51:42.169984: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9373] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-06 18:51:42.170033: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-06 18:51:42.170986: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1534] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-06 18:51:42.176988: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as not

In [2]:
# Load the SST-2 dataset and the tokenizer of the uncased BERT-base checkpoint.
dataset = load_dataset(path="stanfordnlp/sst2")
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased"
)

In [3]:
def tokenize_fn(example):
    """Tokenize the ``"sentence"`` entry of ``example`` with the notebook-level ``tokenizer``.

    The tokenizer is asked to truncate and to pad to ``max_length=128``
    (``padding="max_length"``); its output is returned unchanged.

    Args:
        example: Mapping with a ``"sentence"`` entry. When used through
            ``map(..., batched=True)`` this is a batch of sentences rather
            than a single one.

    Returns:
        Whatever ``tokenizer(...)`` returns for the given sentence(s).
    """
    sentences = example["sentence"]
    encoded = tokenizer(
        sentences,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return encoded

# Apply tokenize_fn batch-wise, then expose only the listed columns as TensorFlow tensors.
tokenized = dataset.map(function=tokenize_fn, batched=True)
tokenized.set_format(
    type="tensorflow",
    columns=["input_ids", "attention_mask", "label"],
)


Map: 100%|██████████| 872/872 [00:00<00:00, 4451.03 examples/s]


In [4]:
def to_tf(ds, batch_size=32, shuffle=True):
    """Wrap an iterable of tokenized examples in a batched ``tf.data.Dataset``.

    Args:
        ds: Iterable whose items can be indexed by ``"input_ids"``,
            ``"attention_mask"`` and ``"label"``.
        batch_size: Number of examples per batch.
        shuffle: When true, shuffle with a 1000-element buffer before batching.

    Returns:
        A prefetched dataset of ``(features, label)`` batches, where
        ``features`` is a dict with ``"input_ids"`` and ``"attention_mask"``.
    """

    def iter_examples():
        # Each call starts a fresh pass over `ds`.
        for row in ds:
            model_inputs = {
                "input_ids": row["input_ids"],
                "attention_mask": row["attention_mask"],
            }
            yield model_inputs, row["label"]

    # Per-example signature: two variable-length int32 vectors and a scalar int64 label.
    feature_spec = {
        "input_ids": tf.TensorSpec(shape=(None,), dtype=tf.int32),
        "attention_mask": tf.TensorSpec(shape=(None,), dtype=tf.int32),
    }
    label_spec = tf.TensorSpec(shape=(), dtype=tf.int64)

    pipeline = tf.data.Dataset.from_generator(
        iter_examples,
        output_signature=(feature_spec, label_spec),
    )
    if shuffle:
        pipeline = pipeline.shuffle(1000)
    pipeline = pipeline.batch(batch_size)
    return pipeline.prefetch(tf.data.AUTOTUNE)

# The training split is shuffled; the validation split keeps its original order.
train_ds = to_tf(tokenized["train"], shuffle=True)
val_ds = to_tf(tokenized["validation"], shuffle=False)

2025-04-06 18:51:46.820926: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-04-06 18:51:46.830198: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-04-06 18:51:46.830234: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-04-06 18:51:46.834589: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-04-06 18:51:46.834630: I external/local_xla/xla/stream_executor

In [5]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """Adds a learned position embedding to a learned token embedding."""

    def __init__(self, vocab_size, embed_dim, max_length):
        """Create the two embedding tables.

        Args:
            vocab_size: Number of rows in the token embedding table.
            embed_dim: Width of both embeddings.
            max_length: Number of rows in the position embedding table.
        """
        super().__init__()
        self.token_emb = tf.keras.layers.Embedding(vocab_size, embed_dim)
        self.pos_emb = tf.keras.layers.Embedding(max_length, embed_dim)

    def call(self, x):
        """Return token embeddings of ``x`` plus embeddings of positions ``0..len-1``.

        The number of positions is the size of the last axis of ``x``; the
        position embeddings are added to the token embeddings by broadcasting.
        """
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(seq_len)
        token_vectors = self.token_emb(x)
        position_vectors = self.pos_emb(position_ids)
        return token_vectors + position_vectors

class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention followed by a two-layer feed-forward network.

    Each sub-layer's output goes through dropout, is added to its input
    (residual connection) and is then layer-normalized.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate=0.1):
        """Create the sub-layers.

        Args:
            embed_dim: Output width of the feed-forward network; also used as
                the per-head ``key_dim`` of the attention layer.
            num_heads: Number of attention heads.
            ff_dim: Width of the hidden (ReLU) feed-forward layer.
            dropout_rate: Dropout rate applied to each sub-layer's output.
        """
        super().__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation='relu'),
            tf.keras.layers.Dense(embed_dim),
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization()
        self.layernorm2 = tf.keras.layers.LayerNormalization()
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs, training=None, attention_mask=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Sequence tensor used as both query and value of the
                attention layer (presumably ``(batch, seq_len, embed_dim)``).
            training: Forwarded to the dropout layers. Defaults to ``None`` so
                the layer can also be called without an explicit flag.
            attention_mask: Optional mask forwarded to the attention layer.
                A rank-2 mask is treated as a per-key padding mask
                (presumably ``(batch, seq_len)``, non-zero = real token) and
                expanded to ``(batch, 1, seq_len)`` so that it broadcasts over
                query positions. Masks of any other rank are passed through
                unchanged. ``None`` (the default) means no masking.

        Returns:
            The result of the second layer normalization.
        """
        if attention_mask is not None:
            attention_mask = tf.cast(attention_mask, tf.bool)
            if len(attention_mask.shape) == 2:
                # Padding mask over keys -> add a broadcastable query axis.
                attention_mask = attention_mask[:, tf.newaxis, :]
        attn_output = self.att(inputs, inputs, attention_mask=attention_mask)
        out1 = self.layernorm1(inputs + self.dropout1(attn_output, training=training))
        ffn_output = self.ffn(out1)
        return self.layernorm2(out1 + self.dropout2(ffn_output, training=training))

In [6]:
def build_transformer_model(vocab_size, max_len=128, embed_dim=256, num_heads=4, ff_dim=1024, num_layers=4):
    """Build a Transformer-encoder sequence classifier with two output logits.

    Args:
        vocab_size: Size of the token vocabulary (rows of the token embedding).
        max_len: Fixed sequence length of both inputs.
        embed_dim: Embedding width used throughout the encoder.
        num_heads: Attention heads per ``TransformerBlock``.
        ff_dim: Hidden width of each block's feed-forward network.
        num_layers: Number of stacked ``TransformerBlock`` layers.

    Returns:
        A ``tf.keras.Model`` taking a dict with ``"input_ids"`` and
        ``"attention_mask"`` (both int32, shape ``(max_len,)`` per example)
        and returning the output of a ``Dense(2)`` layer without activation.
    """
    input_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    x = PositionalEmbedding(vocab_size, embed_dim, max_len)(input_ids)

    # The blocks are called without a mask here, so self-attention still sees
    # every position; padding is excluded at the pooling step below.
    for _ in range(num_layers):
        x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)

    # Average only over positions where attention_mask is non-zero. Without the
    # mask, the mean would also include the vectors of the padding positions.
    x = tf.keras.layers.GlobalAveragePooling1D()(x, mask=attention_mask)
    x = tf.keras.layers.Dropout(0.1)(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.1)(x)
    outputs = tf.keras.layers.Dense(2)(x)

    return tf.keras.Model(inputs={"input_ids": input_ids, "attention_mask": attention_mask}, outputs=outputs)

In [7]:
# Size the token embedding table from the tokenizer's vocabulary.
vocab_size = tokenizer.vocab_size
model = build_transformer_model(vocab_size=vocab_size)

# The model's last Dense(2) layer has no activation, so the loss is configured for logits.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    metrics=["accuracy"],
)

model.fit(x=train_ds, validation_data=val_ds, epochs=3)

Epoch 1/3


2025-04-06 18:51:58.071380: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:467] Loaded cuDNN version 90000
2025-04-06 18:51:58.367493: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f5e48af3e60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-04-06 18:51:58.367539: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3070 Laptop GPU, Compute Capability 8.6
2025-04-06 18:51:58.374346: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1743965518.447091    3711 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7f5f03baab90>