In [101]:
from datasets import load_dataset
from transformers import AutoTokenizer
from src import tensorfi2 as tfi

# 1. 下载 Tatoeba 英法并只取前 1% 样本以加速实验
raw = load_dataset("tatoeba", lang1="en", lang2="fr", split="train[:1%]")  # Tatoeba 英法平行语料 :contentReference[oaicite:4]{index=4}

# 2. 初始化分词器
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# 3. 预处理：添加翻译前缀，tokenize，并将 padding token 转为 -100 以忽略
def preprocess(examples):
    inputs  = ["translate English to French: " + ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    mi = tokenizer(inputs,  max_length=64, truncation=True, padding="max_length")
    lbl = tokenizer(targets, max_length=64, truncation=True, padding="max_length").input_ids
    lbl = [[(t if t != tokenizer.pad_token_id else -100) for t in seq] for seq in lbl]
    mi["labels"] = lbl
    return mi

tokenized = raw.map(preprocess, batched=True, remove_columns=["translation"])  # 动态预处理 :contentReference[oaicite:5]{index=5}


DEBUG:urllib3.connectionpool:Resetting dropped connection: huggingface.co
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /datasets/tatoeba/resolve/main/README.md HTTP/1.1" 307 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /datasets/Helsinki-NLP/tatoeba/resolve/main/README.md HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Resetting dropped connection: s3.amazonaws.com
DEBUG:urllib3.connectionpool:https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/datasets/tatoeba/tatoeba.py HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /datasets/tatoeba/resolve/00476f0f7e251c934e14f6e88c42a15e1b67c5a5/dataset_infos.json HTTP/1.1" 307 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /datasets/Helsinki-NLP/tatoeba/resolve/00476f0f7e251c934e14f6e88c42a15e1b67c5a5/dataset_infos.json HTTP/1.1" 404 0
DEBUG:filelock:Attempting to acquire lock 36951872736 on /Users/lordtarn1shed/.cache/huggingface/modules/datasets_mo

In [102]:
import math
from tqdm import tqdm
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1"
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM

# 可定制的超参数
MODEL_CHECKPOINT = "t5-small"   # 可换成 t5-base、t5-large 等 :contentReference[oaicite:6]{index=6}
LEARNING_RATE     = 5e-5
NUM_BEAMS         = 4

# 加载模型
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)  # 加载预训练 T5 编码器-解码器 :contentReference[oaicite:7]{index=7}

# 若需调整层数 / 头数，可在此处重新定义 config
# e.g., model.config.num_layers = 4; model.config.num_heads = 8


DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /t5-small/resolve/main/config.json HTTP/1.1" 200 0
All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [103]:
# 若未来可用，可取消注释
#from tensorflow.keras import mixed_precision
# mixed_precision.set_global_policy('mixed_float16')

# 1. 环境检测
print("TF version:", tf.__version__)
print("Physical devices:", tf.config.list_physical_devices())

# 2. Optimizer & Loss
optimizer = tf.keras.optimizers.Adam()
loss_fn   = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
lambda_weight = 0.0
epochs = 2
batch_size = 16

tf_train = (
    tokenized
    .to_tf_dataset(
        columns=["input_ids","attention_mask","labels"],
        batch_size=batch_size,
        shuffle=False      # 先关掉 to_tf_dataset 内部的 shuffle
    )
    .cache()               # 缓存所有样本（已按顺序）
    .shuffle(1000, reshuffle_each_iteration=True)  # 关键参数        # 每个 epoch 在缓存上再打乱
    .prefetch(tf.data.AUTOTUNE)
)

tf_test = tokenized.to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    batch_size=8,
    shuffle=False
).cache().prefetch(tf.data.AUTOTUNE)


@tf.function  # 普通编译，无 JIT
def train_step(inputs, masks, labels, lam):
    with tf.GradientTape() as outer, tf.GradientTape() as inner:
        outputs = model(inputs,
                        attention_mask=masks,
                        labels=labels,
                        training=True)
        loss = outputs.loss
    grads = inner.gradient(loss, model.trainable_variables)

    # CPU 上 L1 正则聚合
    with tf.device('/CPU:0'):
        penalties = [
            tf.reduce_sum(tf.abs(tf.cast(g, tf.float32))) / tf.cast(tf.size(g), tf.float32)
            for g in grads if g is not None
        ]
        robust_pen = tf.reduce_sum(penalties) * lam

    total_loss = loss + robust_pen
    final_grads = outer.gradient(
        total_loss,
        model.trainable_variables,
        unconnected_gradients=tf.UnconnectedGradients.ZERO
    )
    clipped = [tf.clip_by_value(g, -1.0, 1.0) for g in final_grads]  # 更温和的裁剪
    optimizer.apply_gradients(zip(clipped, model.trainable_variables))
    del tape  # 显式释放持久化 Tape
    return loss

# --- 训练循环 ---
N = len(tokenized)
steps_per_epoch = math.ceil(N / batch_size)

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    if epoch >= 1:
        lambda_weight = 0.1

    # 重置指标 & 迭代器
    train_metric = tf.keras.metrics.Mean()
    # 方案1：显式迭代器
    train_iter = iter(tf_train)

    pbar = tqdm(range(steps_per_epoch), desc="Training")
    for step in pbar:
        # 取下一批
        batch = next(train_iter)
        loss = train_step(batch["input_ids"],
                          batch["attention_mask"],
                          batch["labels"],
                          lambda_weight)
        train_metric.update_state(loss)

        """
        # 将 loss.numpy() 转为 float
        loss_val   = loss.numpy().item()
        robust_val = (robust_loss).numpy().item()
        pbar.set_postfix({
            "loss":   f"{loss_val:.4f}",
            "robust_loss": f"{robust_val:.4f}"
        })
        """
        
    # 5.2 验证
    val_metric = tf.keras.metrics.Mean()
    for batch in tf_test:
        outputs  = model(batch["input_ids"],
                         attention_mask=batch["attention_mask"],
                         labels=batch["labels"],
                         training=False)
        val_loss = loss_fn(batch["labels"], outputs.logits)
        val_metric.update_state(val_loss)
    print(f">>> Validation Loss after epoch {epoch + 1}: {val_metric.result().numpy()}")



TF version: 2.19.0
Physical devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

Epoch 1/2


Training: 100%|██████████| 166/166 [01:23<00:00,  1.99it/s]


>>> Validation Loss after epoch 1: 0.2177095115184784

Epoch 2/2


Training: 100%|██████████| 166/166 [01:22<00:00,  2.02it/s]


>>> Validation Loss after epoch 2: 0.2177095115184784


In [104]:
import logging
logging.basicConfig(level=logging.DEBUG)
# 推理示例
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

sample = "translate English to French: This is a test."
inp = tokenizer(sample, return_tensors="tf", padding="max_length", max_length=64)
with tf.device('/CPU:0'):
    out = model.generate(inp["input_ids"], attention_mask=inp["attention_mask"], num_beams=NUM_BEAMS)
print("Translation:", tokenizer.decode(out[0], skip_special_tokens=True))



Translation: Il s'agit d'un essai.


In [105]:
# 注入硬件错误
tfi.inject(model=model,
           confFile="/Users/lordtarn1shed/TensorFI2/experiments/layer-states/confFiles/sample.yaml",
           log_level="DEBUG")


# 评估故障
   # 5.2 验证
val_metric = tf.keras.metrics.Mean()
for batch in tf_test:
    outputs  = model(batch["input_ids"],
                     attention_mask=batch["attention_mask"],
                     labels=batch["labels"],
                     training=False)
    val_loss = loss_fn(batch["labels"], outputs.logits)
    val_metric.update_state(val_loss)

print(f">>> Validation Loss after epoch {epoch + 1}: {val_metric.result().numpy()}")

DEBUG:root:Logging level set to DEBUG
INFO:root:Starting fault injection in user-specified layer 1
INFO:root:Completed injections... exiting


Total trainable layers (variables): 131
[Layer 0] Name: shared/shared/embeddings:0, Shape: (32128, 512), Elements: 16449536
[Layer 1] Name: tft5_for_conditional_generation_18/encoder/block_._0/layer_._0/SelfAttention/relative_attention_bias/embeddings:0, Shape: (32, 8), Elements: 256
[Layer 2] Name: tft5_for_conditional_generation_18/encoder/block_._0/layer_._0/SelfAttention/q/kernel:0, Shape: (512, 512), Elements: 262144
[Layer 3] Name: tft5_for_conditional_generation_18/encoder/block_._0/layer_._0/SelfAttention/k/kernel:0, Shape: (512, 512), Elements: 262144
[Layer 4] Name: tft5_for_conditional_generation_18/encoder/block_._0/layer_._0/SelfAttention/v/kernel:0, Shape: (512, 512), Elements: 262144
[Layer 5] Name: tft5_for_conditional_generation_18/encoder/block_._0/layer_._0/SelfAttention/o/kernel:0, Shape: (512, 512), Elements: 262144
[Layer 6] Name: tft5_for_conditional_generation_18/encoder/block_._0/layer_._0/layer_norm/weight:0, Shape: (512,), Elements: 512
[Layer 7] Name: tft5_f

In [106]:
# 推理示例
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

sample = "translate English to French: This is a test."
inp = tokenizer(sample, return_tensors="tf", padding="max_length", max_length=64)
with tf.device('/CPU:0'):
    out = model.generate(inp["input_ids"], attention_mask=inp["attention_mask"], num_beams=NUM_BEAMS)
print("Translation:", tokenizer.decode(out[0], skip_special_tokens=True))

Translation: Il s'agit d'un test.
