In [1]:
from datasets import load_dataset, load_dataset_builder
from transformers import AutoTokenizer
from src import tensorfi2 as tfi
from transformers import MT5TokenizerFast


builder = load_dataset_builder("IWSLT/iwslt2017", name="iwslt2017-en-zh")
num_train_samples = builder.info.splits["train"].num_examples
print(f"Total samples in IWSLT/iwslt2017 en-zh train split: {num_train_samples}")
# Calculate 0.1% of the samples
samples_to_load = int(0.002 * num_train_samples)
print(f"Loading {samples_to_load} samples (0.1% of train split)")
# 1. 下载 wmt14 (英德对) 并只取 0.1% 样本以加速实验
raw = load_dataset("IWSLT/iwslt2017", name="iwslt2017-en-zh", split=f"train[:{samples_to_load}]")

# 2. 初始化分词器
tokenizer = MT5TokenizerFast.from_pretrained("google/mt5-small")

# 3. 预处理：添加翻译前缀，tokenize，并将 padding token 转为 -100 以忽略
def preprocess(examples):
    inputs  = ["translate English to Chinese: " + ex["en"] for ex in examples["translation"]]
    targets = [ex["zh"] for ex in examples["translation"]]
    mi = tokenizer(inputs,  max_length=64, truncation=True, padding="max_length")
    lbl = tokenizer(targets, max_length=64, truncation=True, padding="max_length").input_ids
    lbl = [[(t if t != tokenizer.pad_token_id else -100) for t in seq] for seq in lbl]
    mi["labels"] = lbl
    return mi

tokenized = raw.map(preprocess, batched=True, remove_columns=["translation"])  # 动态预处理 :contentReference[oaicite:5]{index=5}


Total samples in IWSLT/iwslt2017 en-zh train split: 231266
Loading 462 samples (0.1% of train split)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [2]:
import math
from tqdm import tqdm
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1"
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM
from tensorflow.keras import mixed_precision
#mixed_precision.set_global_policy('mixed_float16')  # GPU自动加速

# 可定制的超参数
MODEL_CHECKPOINT = "google/mt5-small"   # 可换成 t5-base、t5-large 等 :contentReference[oaicite:6]{index=6}
LEARNING_RATE     = 5e-5
NUM_BEAMS         = 4

# 加载模型
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)  # 加载预训练 T5 编码器-解码器 :contentReference[oaicite:7]{index=7}

# 若需调整层数 / 头数，可在此处重新定义 config
# e.g., model.config.num_layers = 4; model.config.num_heads = 8


2025-05-12 23:46:04.551571: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2025-05-12 23:46:04.551646: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2025-05-12 23:46:04.551663: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
I0000 00:00:1747064764.551977 2449745 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1747064764.552017 2449745 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All model checkpoint layers were used when initializing TFMT5ForConditionalGeneration.

All the layers of TFMT5ForConditionalGeneration were initialized from the model checkpoint at google/mt5-small.
If your task is similar to the task the model of the checkpoin

In [3]:
tf_train =( 
    tokenized.to_tf_dataset(
        columns=["input_ids", "attention_mask", "labels"],
        batch_size=16,
        shuffle=True
    )
    .cache()                        # 缓存到内存/磁盘
    .shuffle(2000, reshuffle_each_iteration=True)  # 增加buffer_size
    .prefetch(buffer_size=tf.data.AUTOTUNE)  # 异步预取
    .repeat()  # 避免每个epoch重新初始化
)


tf_test = tokenized.to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    batch_size=16,
    shuffle=False
)

In [4]:
def safe_generate(model, input_ids, attention_mask, **gen_kwargs):
    # 1. 保存原始精度策略
    orig_policy = mixed_precision.global_policy().name

    try:
        # 2. 切换到 float32，再执行 generate
        mixed_precision.set_global_policy('float32')
        return model.generate(input_ids, attention_mask=attention_mask, use_cache=False, **gen_kwargs)

    except tf.errors.NotFoundError as e:
        print("⚠️ generate 报 NotFoundError，切到 CPU 重试:", e)
        # 3. 在 CPU 上重试
        with tf.device('/CPU:0'):
            return model.generate(input_ids, attention_mask=attention_mask, use_cache=False, **gen_kwargs)

    finally:
        # 4. 恢复原始精度策略
        mixed_precision.set_global_policy(orig_policy)

def evaluate_robustness(model,
                        tf_test,
                        num_batches: int = 100,
                        conf_file: str = "conf/sample.yaml"):
    """
    Evaluate SDC rate and crash rate on a subset of tf_test.
    
    Args:
      model      : 已加载的 TFAutoModelForSeq2SeqLM 实例
      tf_test    : tf.data.Dataset，包含 input_ids, attention_mask, labels
      num_batches: 取多少个 batch 进行测试
      flip_prob  : 注入 bit‐flip 失败概率
      conf_file  : TensorFI 的配置文件路径
    
    Returns:
      sdc_rate   : (num_sdc / total_samples)
      crash_rate : (num_crash / total_samples)
    """
    
    # 1. 先收集基线输出
    baseline_outputs = []
    input_batches = []
    for i, batch in enumerate(tf_test):
        if i >= num_batches:
            break
        input_ids      = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        # 使用 beam search 生成结果
        preds = model.generate(input_ids,
                               attention_mask=attention_mask,
                               num_beams=NUM_BEAMS,
                               max_length=64,
                               use_cache=False)
        baseline_outputs.append(preds)
        input_batches.append((input_ids, attention_mask))

    total = len(baseline_outputs)
    sdc_count   = 0
    crash_count = 0

    # 注入前保存
    orig_weights = model.get_weights()  

    # 2. 对每个 batch 注入故障并重推理
    for (input_ids, attention_mask), base_preds in zip(input_batches, baseline_outputs):
        try:
            # 在模型上注入硬件故障
            tfi.inject(model=model,
                       confFile=conf_file,
                       log_level="ERROR")

            # 带故障推理
            faulty_preds = safe_generate(model,            # ← 这里一定要传 model
                                        input_ids,
                                        attention_mask=attention_mask,
                                        num_beams=NUM_BEAMS,
                                        max_length=64)


            # 若推理结果与基线不同，则计作 SDC
            if not tf.reduce_all(faulty_preds == base_preds):
                sdc_count += 1

        except Exception as e:
            # 推理过程中崩溃
            crash_count += 1

        finally:
            # 恢复模型到无注入状态
            # 恢复模型权重
            model.set_weights(orig_weights)


    sdc_rate   = sdc_count / total
    crash_rate = crash_count / total
    return sdc_rate, crash_rate, total


In [5]:
# 定义 loss 函数和优化器
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

optimizer = tf.keras.optimizers.Adam()
lambda_weight = 0  # 正则项权重，可调

# 训练参数
epochs = 4
N_samples = len(tokenized)            # 注意是 tokenized，不是 raw
batch_size = 16
# 正确写法（匹配 drop_remainder=True 后的实际批次数）
#steps_per_epoch = N_samples // batch_size  # 整除
steps_per_epoch = math.ceil(N_samples / batch_size)

for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    
    train_iter = iter(tf_train)

    if epoch >= 2:
        lambda_weight = 1e-2  # 正则项权重，可调

    pbar = tqdm(range(steps_per_epoch), desc="Training")
    for step in pbar:
    #for step in range(steps_per_epoch):
        batch = next(train_iter)
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        # 外层 Tape：用于最终反向传播
        with tf.GradientTape() as outer_tape:
            # 内层 Tape：用于计算 loss 的梯度
            with tf.GradientTape() as inner_tape:
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels, training=True)
                
                logits = outputs.logits
                #loss = loss_fn(labels, logits)
                loss = outputs.loss

            # 计算梯度敏感性鲁棒性正则项（所有参数）
            grads = inner_tape.gradient(loss, model.trainable_variables)
            # 分层归一：每层求和后除以参数数目
            robust_penalty = tf.add_n([
                tf.reduce_sum(tf.abs(g)) / tf.cast(tf.size(g), tf.float32)
                for g in grads if g is not None
            ])

            robust_loss = lambda_weight * robust_penalty

            
            # 将 loss.numpy() 转为 float
            loss_val   = float(loss)
            robust_val = float(robust_loss)
            pbar.set_postfix({
                "loss":   f"{loss_val:.4f}",
                "robust_loss": f"{robust_val:.4f}"
            })
            

            total_loss = loss + robust_loss

        # 外层梯度计算：对 total_loss 求导并更新
        final_grads = outer_tape.gradient(total_loss, model.trainable_variables)
        #optimizer.apply_gradients(zip(final_grads, model.trainable_variables))

        clipped_grads = tf.clip_by_global_norm(final_grads, 1.0)[0]  # 全局裁剪
        optimizer.apply_gradients(zip(clipped_grads, model.trainable_variables))


    # 验证阶段（无扰动）
    total_val_loss = 0.0
    num_val_batches = 0

    for batch in tf_test:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels, training=False)
        logits = outputs.logits
        val_loss = loss_fn(labels, logits)

        total_val_loss += val_loss
        num_val_batches += 1

    avg_val_loss = total_val_loss / num_val_batches
    print(f"Validation Loss after epoch {epoch + 1}: {avg_val_loss:.4f}")


Epoch 1/4


Training: 100%|████████████████████| 29/29 [04:16<00:00,  8.84s/it, loss=7.8312, robust_loss=0.0000]
2025-05-12 23:50:37.030312: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Validation Loss after epoch 1: 2.3223

Epoch 2/4


Training: 100%|████████████████████| 29/29 [04:33<00:00,  9.42s/it, loss=5.8706, robust_loss=0.0000]
2025-05-12 23:55:26.071918: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Validation Loss after epoch 2: 1.6174

Epoch 3/4


Training: 100%|████████████████████| 29/29 [04:41<00:00,  9.69s/it, loss=4.7420, robust_loss=0.0017]


Validation Loss after epoch 3: 1.3148

Epoch 4/4


Training: 100%|████████████████████| 29/29 [04:24<00:00,  9.13s/it, loss=4.7650, robust_loss=0.0014]


Validation Loss after epoch 4: 1.0691


2025-05-13 00:05:01.248842: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [None]:
# —— 调用鲁棒性评估 —— 
sdc_rate, crash_rate, total = evaluate_robustness(
    model=model,
    tf_test=tf_test,
    num_batches=10,            # 或者根据你测试数据量调整
    conf_file="/Users/lordtarn1shed/TensorFI2/experiments/layer-states/confFiles/sample.yaml"
)
print(f"Epoch {epoch + 1}  >>  SDC Rate: {sdc_rate:.2f}, Crash Rate: {crash_rate:.2f}, Total: {total:.0f}")

Total trainable layers (variables): 190
[Layer 0] Name: shared/shared/embeddings:0, Shape: (250112, 512), Elements: 128057344
[Layer 1] Name: tfmt5_for_conditional_generation/encoder/block_._0/layer_._0/SelfAttention/relative_attention_bias/embeddings:0, Shape: (32, 6), Elements: 192
[Layer 2] Name: tfmt5_for_conditional_generation/encoder/block_._0/layer_._0/SelfAttention/q/kernel:0, Shape: (512, 384), Elements: 196608
[Layer 3] Name: tfmt5_for_conditional_generation/encoder/block_._0/layer_._0/SelfAttention/k/kernel:0, Shape: (512, 384), Elements: 196608
[Layer 4] Name: tfmt5_for_conditional_generation/encoder/block_._0/layer_._0/SelfAttention/v/kernel:0, Shape: (512, 384), Elements: 196608
[Layer 5] Name: tfmt5_for_conditional_generation/encoder/block_._0/layer_._0/SelfAttention/o/kernel:0, Shape: (384, 512), Elements: 196608
[Layer 6] Name: tfmt5_for_conditional_generation/encoder/block_._0/layer_._0/layer_norm/weight:0, Shape: (512,), Elements: 512
[Layer 7] Name: tfmt5_for_condit

In [None]:
# 推理示例
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

sample = "translate English to Chinese: This is a test."
inp = tokenizer(sample, return_tensors="tf", padding="max_length", max_length=64)
with tf.device('/CPU:0'):
    out = model.generate(inp["input_ids"], attention_mask=inp["attention_mask"], num_beams=NUM_BEAMS)
print("Translation:", tokenizer.decode(out[0], skip_special_tokens=True))