In [11]:
from datasets import load_dataset
from transformers import AutoTokenizer
from src import tensorfi2 as tfi

# 1. 下载 Tatoeba 英法并只取前 1% 样本以加速实验
raw = load_dataset("tatoeba", lang1="en", lang2="fr", split="train[:1%]")  # Tatoeba 英法平行语料 :contentReference[oaicite:4]{index=4}

# 2. 初始化分词器
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# 3. 预处理：添加翻译前缀，tokenize，并将 padding token 转为 -100 以忽略
def preprocess(examples):
    # 1. 添加翻译前缀并编码输入
    inputs = ["translate English to French: " + ex["en"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=64, truncation=True, padding="max_length")

    # 2. 编码目标文本并生成 decoder_input_ids
    with tokenizer.as_target_tokenizer():
        targets = tokenizer(
            [ex["fr"] for ex in examples["translation"]],
            max_length=64,
            truncation=True,
            padding="max_length"
        )
    
    # 3. 生成 decoder_input_ids（右移的 labels）
    labels = targets["input_ids"]
    decoder_input_ids = [
        [tokenizer.pad_token_id] + seq[:-1]  # 在开头添加 pad，并右移
        for seq in labels
    ]
    
    # 4. 替换 labels 中的 pad_token_id 为 -100（忽略损失计算）
    labels = [
        [(t if t != tokenizer.pad_token_id else -100) for t in seq]
        for seq in labels
    ]

    model_inputs["labels"] = labels
    model_inputs["decoder_input_ids"] = decoder_input_ids  # 关键修复
    
    return model_inputs

tokenized = raw.map(preprocess, batched=True, remove_columns=["translation"])  # 动态预处理 :contentReference[oaicite:5]{index=5}


In [12]:
import math
from tqdm import tqdm
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1"
import tensorflow as tf
import numpy as np
from transformers import TFAutoModelForSeq2SeqLM
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')  # GPU自动加速

# 可定制的超参数
MODEL_CHECKPOINT = "t5-small"   # 可换成 t5-base、t5-large 等 :contentReference[oaicite:6]{index=6}
LEARNING_RATE     = 5e-5
NUM_BEAMS         = 4

# 加载模型
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)  # 加载预训练 T5 编码器-解码器 :contentReference[oaicite:7]{index=7}

# 若需调整层数 / 头数，可在此处重新定义 config
# e.g., model.config.num_layers = 4; model.config.num_heads = 8


All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [13]:
tf_train =( 
    tokenized.to_tf_dataset(
        columns=["input_ids", "attention_mask", "decoder_input_ids", "labels"],
        batch_size=16,
        shuffle=True
    )
    .cache()                        # 缓存到内存/磁盘
    .shuffle(2000, reshuffle_each_iteration=True)  # 增加buffer_size
    .prefetch(buffer_size=tf.data.AUTOTUNE)  # 异步预取
    .repeat()  # 避免每个epoch重新初始化
)

tf_test = tokenized.to_tf_dataset(
    columns=["input_ids", "attention_mask", "decoder_input_ids", "labels"],
    batch_size=16,
    shuffle=False
).repeat().prefetch(tf.data.AUTOTUNE)

In [14]:
def bit_flip_attack(model, flip_prob=1e-4):
    """模拟权重中的随机比特翻转"""
    original_weights = model.get_weights()
    corrupted_weights = []
    
    for w in original_weights:
        if len(w.shape) == 0:  # 跳过标量（如某些优化器状态）
            corrupted_weights.append(w)
            continue
        
        # 生成随机掩码选择翻转位置
        mask = tf.random.uniform(w.shape) < flip_prob
        # 生成随机翻转量（±1e-3模拟比特错误）
        delta = tf.random.uniform(w.shape, minval=-1e-3, maxval=1e-3) * tf.cast(mask, tf.float32)
        corrupted = w + delta
        corrupted_weights.append(corrupted.numpy())
    
    model.set_weights(corrupted_weights)

def evaluate_robustness(model, test_dataset, num_samples, flip_prob=1e-4):
    test_dataset = test_dataset.repeat()
    test_iter = iter(test_dataset)
    
    # 动态计算实际可评估的最大样本数
    total_samples = min(num_samples, len(test_dataset) * test_dataset._batch_size)  # 近似计算
    num_batches = total_samples // test_dataset._batch_size
    
    total_sdc = 0
    total_crash = 0
    total_tested = 0
    original_weights = model.get_weights()

    # 1. 生成基准输出（带进度条）
    baseline_outputs = []
    print("\nGenerating Baseline Predictions...")
    for _ in tqdm(range(num_batches), desc="Baseline"):
        batch = next(test_iter)
        logits = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            decoder_input_ids=batch["decoder_input_ids"],
            training=False
        ).logits
        baseline_outputs.append(tf.argmax(logits, axis=-1))
    
    # 2. 注入错误评估（带实时指标显示）
    print("\nEvaluating Robustness with Bit Flips...")
    test_iter = iter(test_dataset)  # 重置迭代器
    pbar = tqdm(total=total_samples, desc="Testing", unit="sample")
    
    for i in range(num_batches):
        batch = next(test_iter)
        batch_size = batch["input_ids"].shape[0]
        
        try:
            bit_flip_attack(model, flip_prob)
            
            logits = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                decoder_input_ids=batch["decoder_input_ids"],
                training=False
            ).logits
            preds = tf.argmax(logits, axis=-1)
            
            # 计算当前批次的 SDC
            sdc = tf.reduce_sum(tf.cast(preds != baseline_outputs[i], tf.int32)).numpy()
            total_sdc += sdc
            
        except Exception as e:
            total_crash += batch_size
            
        finally:
            model.set_weights(original_weights)
            total_tested += batch_size
            
            # 更新进度条描述
            current_sdc = total_sdc / total_tested if total_tested > 0 else 0
            current_crash = total_crash / total_tested if total_tested > 0 else 0
            pbar.set_postfix({
                "SDC%": f"{current_sdc*100:.2f}%",
                "Crash%": f"{current_crash*100:.2f}%"
            })
            pbar.update(batch_size)  # 按实际批次大小更新进度
            
    pbar.close()
    
    return total_sdc / total_tested, total_crash / total_tested

In [15]:
# 定义 loss 函数和优化器
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

optimizer = tf.keras.optimizers.Adam()
lambda_weight = 0  # 正则项权重，可调

# 训练参数
epochs = 2
N_samples = len(tokenized)            # 注意是 tokenized，不是 raw
batch_size = 16
# 正确写法（匹配 drop_remainder=True 后的实际批次数）
#steps_per_epoch = N_samples // batch_size  # 整除
steps_per_epoch = math.ceil(N_samples / batch_size)

for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    
    train_iter = iter(tf_train)

    if epoch >= 1:
        lambda_weight = 1e-1  # 正则项权重，可调

    pbar = tqdm(range(steps_per_epoch), desc="Training")
    for step in pbar:
    #for step in range(steps_per_epoch):
        batch = next(train_iter)
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        # 外层 Tape：用于最终反向传播
        with tf.GradientTape() as outer_tape:
            # 内层 Tape：用于计算 loss 的梯度
            with tf.GradientTape() as inner_tape:
                outputs = model(input_ids, attention_mask=attention_mask, decoder_input_ids=batch["decoder_input_ids"], labels=labels, training=True)
                
                logits = outputs.logits
                #loss = loss_fn(labels, logits)
                loss = outputs.loss

            # 计算梯度敏感性鲁棒性正则项（所有参数）
            grads = inner_tape.gradient(loss, model.trainable_variables)
            # 分层归一：每层求和后除以参数数目
            robust_penalty = tf.add_n([
                tf.reduce_sum(tf.abs(g)) / tf.cast(tf.size(g), tf.float32)
                for g in grads if g is not None
            ])

            robust_loss = lambda_weight * robust_penalty

            
            # 将 loss.numpy() 转为 float
            loss_val   = float(loss)
            robust_val = float(robust_loss)
            pbar.set_postfix({
                "loss":   f"{loss_val:.4f}",
                "robust_loss": f"{robust_val:.4f}"
            })
            

            total_loss = loss + robust_loss

        # 外层梯度计算：对 total_loss 求导并更新
        final_grads = outer_tape.gradient(total_loss, model.trainable_variables)
        #optimizer.apply_gradients(zip(final_grads, model.trainable_variables))

        clipped_grads = tf.clip_by_global_norm(final_grads, 1.0)[0]  # 全局裁剪
        optimizer.apply_gradients(zip(clipped_grads, model.trainable_variables))

    print(f"epoch {epoch + 1} complete")


    val_steps = math.ceil(len(tokenized) / batch_size)
    total_val_loss = 0.0

    for i, batch in zip(range(val_steps), tf_test):
        # 提取输入参数
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        decoder_input_ids = batch["decoder_input_ids"]
        labels = batch["labels"]
        
        # 前向传播
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            labels=labels,
            training=False
        )
        
        # 关键修复：确保损失为标量
        loss_value = float(outputs.loss.numpy())  # 或 outputs.loss.numpy().item()
        total_val_loss += loss_value

    avg_val_loss = total_val_loss / val_steps
    print(f"Validation Loss: {avg_val_loss:.4f}")  # 现在 avg_val_loss 是 float
    # 新增鲁棒性评估
    sdc_rate, crash_rate = evaluate_robustness(model, tf_test, num_samples=100, flip_prob=1e-4)
    print(f"SDC Rate: {sdc_rate:.4f}, Crash Rate: {crash_rate:.4f}")


Epoch 1/2


Training: 100%|██████████| 166/166 [07:13<00:00,  2.61s/it, loss=0.8035, robust_loss=0.0000]


epoch 1 complete


  loss_value = float(outputs.loss.numpy())  # 或 outputs.loss.numpy().item()


Validation Loss: 0.4813


TypeError: The dataset is infinite.

In [None]:
# 推理示例
import logging
logging.basicConfig(level=logging.DEBUG)
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

sample = "translate English to French: This is a test."
inp = tokenizer(sample, return_tensors="tf", padding="max_length", max_length=64)
with tf.device('/CPU:0'):
    out = model.generate(inp["input_ids"], attention_mask=inp["attention_mask"], num_beams=NUM_BEAMS)
print("Translation:", tokenizer.decode(out[0], skip_special_tokens=True))

In [None]:
# 注入硬件错误
tfi.inject(model=model,
           confFile="/Users/lordtarn1shed/TensorFI2/experiments/layer-states/confFiles/sample.yaml",
           log_level="DEBUG")


# 评估故障
total_val_loss = 0.0
num_val_batches = 0

for batch in tf_test:
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]

    outputs = model(input_ids, attention_mask=attention_mask, labels=labels, training=False)
    logits = outputs.logits
    val_loss = loss_fn(labels, logits)

    total_val_loss += val_loss
    num_val_batches += 1

avg_val_loss = total_val_loss / num_val_batches
print(f"Validation Loss after epoch {epoch + 1}: {avg_val_loss:.4f}")

sdc_rate, crash_rate = evaluate_robustness(model, tf_test, num_samples=100, flip_prob=1e-4)
print(f"SDC Rate: {sdc_rate:.4f}, Crash Rate: {crash_rate:.4f}")

In [None]:
# 推理示例
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

sample = "translate English to French: This is a test."
inp = tokenizer(sample, return_tensors="tf", padding="max_length", max_length=64)
with tf.device('/CPU:0'):
    out = model.generate(inp["input_ids"], attention_mask=inp["attention_mask"], num_beams=NUM_BEAMS)
print("Translation:", tokenizer.decode(out[0], skip_special_tokens=True))