In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer

# 1. 下载 Tatoeba 英法并只取前 1% 样本以加速实验
raw = load_dataset("tatoeba", lang1="en", lang2="fr", split="train[:1%]")  # Tatoeba 英法平行语料 :contentReference[oaicite:4]{index=4}

# 2. 初始化分词器
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# 3. 预处理：添加翻译前缀，tokenize，并将 padding token 转为 -100 以忽略
def preprocess(examples):
    inputs  = ["translate English to French: " + ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    mi = tokenizer(inputs,  max_length=64, truncation=True, padding="max_length")
    lbl = tokenizer(targets, max_length=64, truncation=True, padding="max_length").input_ids
    lbl = [[(t if t != tokenizer.pad_token_id else -100) for t in seq] for seq in lbl]
    mi["labels"] = lbl
    return mi

tokenized = raw.map(preprocess, batched=True, remove_columns=["translation"])  # 动态预处理 :contentReference[oaicite:5]{index=5}


In [2]:
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM

# 可定制的超参数
MODEL_CHECKPOINT = "t5-small"   # 可换成 t5-base、t5-large 等 :contentReference[oaicite:6]{index=6}
LEARNING_RATE     = 5e-5
NUM_BEAMS         = 4

# 加载模型
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)  # 加载预训练 T5 编码器-解码器 :contentReference[oaicite:7]{index=7}

# 若需调整层数 / 头数，可在此处重新定义 config
# e.g., model.config.num_layers = 4; model.config.num_heads = 8


2025-04-23 23:04:23.704870: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2025-04-23 23:04:23.704912: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2025-04-23 23:04:23.704918: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
I0000 00:00:1745420663.704931  273416 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1745420663.704962  273416 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can 

In [3]:
# 转为 tf.data.Dataset
tf_train = tokenized.to_tf_dataset(
    columns=["input_ids","attention_mask"],
    label_cols=["labels"],
    batch_size=16,
    shuffle=True,
)
tf_test = tokenized.to_tf_dataset(
    columns=["input_ids","attention_mask"],
    label_cols=["labels"],
    batch_size=16,
    shuffle=False,
)

# 编译与训练
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer)
model.fit(tf_train, epochs=2)                  # 训练 :contentReference[oaicite:8]{index=8}
results = model.evaluate(tf_test)              # 评估
print("Test loss:", results)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


Epoch 1/2


2025-04-23 23:04:30.159604: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
E0000 00:00:1745420670.638479  273416 meta_optimizer.cc:967] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/AssignAddVariableOp.


Epoch 2/2
Test loss: 0.6644615530967712


In [4]:
# 推理示例
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

sample = "translate English to French: This is a test."
inp = tokenizer(sample, return_tensors="tf", padding="max_length", max_length=64)
with tf.device('/CPU:0'):
    out = model.generate(inp["input_ids"], attention_mask=inp["attention_mask"], num_beams=NUM_BEAMS)
print("Translation:", tokenizer.decode(out[0], skip_special_tokens=True))

I0000 00:00:1745420820.138547  273416 service.cc:152] XLA service 0x600003454500 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1745420820.138564  273416 service.cc:160]   StreamExecutor device (0): Host, Default Version
2025-04-23 23:07:00.159105: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1745420820.327738  273416 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Translation: C'est un test.
