In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from src import tensorfi2 as tfi

# 1. 下载 Tatoeba 英法并只取前 1% 样本以加速实验
raw = load_dataset("tatoeba", lang1="en", lang2="fr", split="train[:1%]")  # Tatoeba 英法平行语料 :contentReference[oaicite:4]{index=4}

# 2. 初始化分词器
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# 3. 预处理：添加翻译前缀，tokenize，并将 padding token 转为 -100 以忽略
def preprocess(examples):
    inputs  = ["translate English to French: " + ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    mi = tokenizer(inputs,  max_length=64, truncation=True, padding="max_length")
    lbl = tokenizer(targets, max_length=64, truncation=True, padding="max_length").input_ids
    lbl = [[(t if t != tokenizer.pad_token_id else -100) for t in seq] for seq in lbl]
    mi["labels"] = lbl
    return mi

tokenized = raw.map(preprocess, batched=True, remove_columns=["translation"])  # 动态预处理 :contentReference[oaicite:5]{index=5}


In [2]:
import math
from tqdm import tqdm
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1"
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')  # GPU自动加速

# 可定制的超参数
MODEL_CHECKPOINT = "t5-small"   # 可换成 t5-base、t5-large 等 :contentReference[oaicite:6]{index=6}
LEARNING_RATE     = 5e-5
NUM_BEAMS         = 4

# 加载模型
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)  # 加载预训练 T5 编码器-解码器 :contentReference[oaicite:7]{index=7}

# 若需调整层数 / 头数，可在此处重新定义 config
# e.g., model.config.num_layers = 4; model.config.num_heads = 8


2025-05-09 21:57:01.917934: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2025-05-09 21:57:01.917957: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2025-05-09 21:57:01.917965: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
I0000 00:00:1746799021.917975 1876143 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1746799021.917992 1876143 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can 

In [3]:
tf_train =( 
    tokenized.to_tf_dataset(
        columns=["input_ids", "attention_mask", "labels"],
        batch_size=16,
        shuffle=True
    )
    .cache()                        # 缓存到内存/磁盘
    .shuffle(2000, reshuffle_each_iteration=True)  # 增加buffer_size
    .prefetch(buffer_size=tf.data.AUTOTUNE)  # 异步预取
    .repeat()  # 避免每个epoch重新初始化
)


tf_test = tokenized.to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    batch_size=16,
    shuffle=False
)

# 定义 loss 函数和优化器
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

optimizer = tf.keras.optimizers.Adam()
lambda_weight = 0  # 正则项权重，可调

# 训练参数
epochs = 2
N_samples = len(tokenized)            # 注意是 tokenized，不是 raw
batch_size = 16
# 正确写法（匹配 drop_remainder=True 后的实际批次数）
#steps_per_epoch = N_samples // batch_size  # 整除
steps_per_epoch = math.ceil(N_samples / batch_size)

for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    
    train_iter = iter(tf_train)

    if epoch >= 1:
        lambda_weight = 1e-1  # 正则项权重，可调

    pbar = tqdm(range(steps_per_epoch), desc="Training")
    for step in pbar:
    #for step in range(steps_per_epoch):
        batch = next(train_iter)
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        # 外层 Tape：用于最终反向传播
        with tf.GradientTape() as outer_tape:
            # 内层 Tape：用于计算 loss 的梯度
            with tf.GradientTape() as inner_tape:
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels, training=True)
                
                logits = outputs.logits
                #loss = loss_fn(labels, logits)
                loss = outputs.loss

            # 计算梯度敏感性鲁棒性正则项（所有参数）
            grads = inner_tape.gradient(loss, model.trainable_variables)
            # 分层归一：每层求和后除以参数数目
            robust_penalty = tf.add_n([
                tf.reduce_sum(tf.abs(g)) / tf.cast(tf.size(g), tf.float32)
                for g in grads if g is not None
            ])

            robust_loss = lambda_weight * robust_penalty

            
            # 将 loss.numpy() 转为 float
            loss_val   = float(loss)
            robust_val = float(robust_loss)
            pbar.set_postfix({
                "loss":   f"{loss_val:.4f}",
                "robust_loss": f"{robust_val:.4f}"
            })
            

            total_loss = loss + robust_loss

        # 外层梯度计算：对 total_loss 求导并更新
        final_grads = outer_tape.gradient(total_loss, model.trainable_variables)
        #optimizer.apply_gradients(zip(final_grads, model.trainable_variables))

        clipped_grads = tf.clip_by_global_norm(final_grads, 1.0)[0]  # 全局裁剪
        optimizer.apply_gradients(zip(clipped_grads, model.trainable_variables))


    # 验证阶段（无扰动）
    total_val_loss = 0.0
    num_val_batches = 0

    for batch in tf_test:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels, training=False)
        logits = outputs.logits
        val_loss = loss_fn(labels, logits)

        total_val_loss += val_loss
        num_val_batches += 1

    avg_val_loss = total_val_loss / num_val_batches
    print(f"Validation Loss after epoch {epoch + 1}: {avg_val_loss:.4f}")


Epoch 1/2


Training: 100%|██████████████████| 166/166 [06:04<00:00,  2.20s/it, loss=0.7484, robust_loss=0.0000]
2025-05-09 22:03:34.298600: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Validation Loss after epoch 1: 0.1217

Epoch 2/2


Training: 100%|██████████████████| 166/166 [06:09<00:00,  2.23s/it, loss=0.7223, robust_loss=0.0137]


Validation Loss after epoch 2: 0.0835


2025-05-09 22:10:10.209329: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [4]:
# 推理示例
import logging
logging.basicConfig(level=logging.DEBUG)
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

sample = "translate English to French: This is a test."
inp = tokenizer(sample, return_tensors="tf", padding="max_length", max_length=64)
with tf.device('/CPU:0'):
    out = model.generate(inp["input_ids"], attention_mask=inp["attention_mask"], num_beams=NUM_BEAMS)
print("Translation:", tokenizer.decode(out[0], skip_special_tokens=True))

I0000 00:00:1746799810.501293 1876143 service.cc:152] XLA service 0x600001f36f00 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1746799810.501453 1876143 service.cc:160]   StreamExecutor device (0): Host, Default Version
2025-05-09 22:10:10.532090: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1746799810.737328 1876143 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Translation: C'est un critère.


In [5]:
# 注入硬件错误
tfi.inject(model=model,
           confFile="/Users/lordtarn1shed/TensorFI2/experiments/layer-states/confFiles/sample.yaml",
           log_level="DEBUG")


# 评估故障
total_val_loss = 0.0
num_val_batches = 0

for batch in tf_test:
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]

    outputs = model(input_ids, attention_mask=attention_mask, labels=labels, training=False)
    logits = outputs.logits
    val_loss = loss_fn(labels, logits)

    total_val_loss += val_loss
    num_val_batches += 1

avg_val_loss = total_val_loss / num_val_batches
print(f"Validation Loss after epoch {epoch + 1}: {avg_val_loss:.4f}")

DEBUG:root:Logging level set to DEBUG
INFO:root:Starting fault injection in user-specified layer 1
INFO:root:Completed injections... exiting


Total trainable layers (variables): 131
[Layer 0] Name: shared/shared/embeddings:0, Shape: (32128, 512), Elements: 16449536
[Layer 1] Name: tft5_for_conditional_generation/encoder/block_._0/layer_._0/SelfAttention/relative_attention_bias/embeddings:0, Shape: (32, 8), Elements: 256
[Layer 2] Name: tft5_for_conditional_generation/encoder/block_._0/layer_._0/SelfAttention/q/kernel:0, Shape: (512, 512), Elements: 262144
[Layer 3] Name: tft5_for_conditional_generation/encoder/block_._0/layer_._0/SelfAttention/k/kernel:0, Shape: (512, 512), Elements: 262144
[Layer 4] Name: tft5_for_conditional_generation/encoder/block_._0/layer_._0/SelfAttention/v/kernel:0, Shape: (512, 512), Elements: 262144
[Layer 5] Name: tft5_for_conditional_generation/encoder/block_._0/layer_._0/SelfAttention/o/kernel:0, Shape: (512, 512), Elements: 262144
[Layer 6] Name: tft5_for_conditional_generation/encoder/block_._0/layer_._0/layer_norm/weight:0, Shape: (512,), Elements: 512
[Layer 7] Name: tft5_for_conditional_gen

In [6]:
# 推理示例
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

sample = "translate English to French: This is a test."
inp = tokenizer(sample, return_tensors="tf", padding="max_length", max_length=64)
with tf.device('/CPU:0'):
    out = model.generate(inp["input_ids"], attention_mask=inp["attention_mask"], num_beams=NUM_BEAMS)
print("Translation:", tokenizer.decode(out[0], skip_special_tokens=True))

Translation: C'est un test.
