In [1]:
import os

# Point Hugging Face Hub traffic at the mirror *before* importing `datasets` /
# `transformers`. Those packages import `huggingface_hub`, which reads
# HF_ENDPOINT when it is first imported, so assigning the variable in a later
# cell does not change the endpoint used by the already-imported libraries.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["HF_HUB_ENDPOINT"] = "https://hf-mirror.com"

import torch


from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
import numpy as np

# Download the model through the ModelScope SDK. `model_dir` is later passed to
# `from_pretrained` for both the model and the tokenizer.
from modelscope import snapshot_download
model_dir = snapshot_download('AI-ModelScope/t5-base')

  from .autonotebook import tqdm as notebook_tqdm


Downloading Model to directory: C:\Users\1\.cache\modelscope\hub\AI-ModelScope/t5-base




In [2]:
# Set the Hugging Face mirror endpoint.
# NOTE(review): libraries that read these variables at import time only honour
# them if this assignment runs before the import - confirm the cell ordering.
for _endpoint_var in ("HF_ENDPOINT", "HF_HUB_ENDPOINT"):
    os.environ[_endpoint_var] = "https://hf-mirror.com"

In [3]:
# model_name = "/home/valiantsec/phb/models/flan-t5-small"
# Both the tokenizer and the seq2seq model are loaded from the snapshot
# directory downloaded earlier; the two loads are independent of each other.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_dir,
    device_map="auto",
)

In [4]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings: rank-16 adapters on the modules named "q" and "v",
# no bias training, configured for a sequence-to-sequence LM task.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)

# NOTE(review): this rebinds `model` to the wrapped model, so re-running the
# cell would wrap the already-wrapped model again - restart the kernel instead.
model = get_peft_model(model, lora_config)


In [5]:
# Load the "sentences_allagree" configuration of the financial_phrasebank
# dataset; `trust_remote_code=True` allows the dataset's loading script to run.
dataset = load_dataset(
    path="financial_phrasebank",
    name="sentences_allagree",
    trust_remote_code=True,
)

Using the latest cached version of the module from C:\Users\1\.cache\huggingface\modules\datasets_modules\datasets\financial_phrasebank\550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141 (last modified on Fri Jul 18 19:31:03 2025) since it couldn't be found locally at financial_phrasebank, or remotely on the Hugging Face Hub.


In [6]:
# Distinct integer label ids present in the train split.
set(dataset["train"]["label"])

{0, 1, 2}

In [7]:
# Hold out 10% of the original "train" split as a validation set.
# - A fixed seed makes the split (and therefore the reported metrics)
#   reproducible across kernel restarts.
# - The guard keeps the cell idempotent: re-running it will not split the
#   already-reduced train set a second time.
if "validation" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
    dataset["validation"] = dataset.pop("test")
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 2037
    })
    validation: Dataset({
        features: ['sentence', 'label'],
        num_rows: 227
    })
})

In [8]:
# Class names of the `label` feature; the list index is the integer label id.
dataset["train"].features["label"].names

['negative', 'neutral', 'positive']

In [9]:
classes = dataset["train"].features["label"].names


def _attach_text_label(batch):
    """Add a `text_label` column holding the class name for each integer label."""
    return {"text_label": [classes[label_id] for label_id in batch["label"]]}


# Batched map over every split. TODO: would num_proc=4 work here?
dataset = dataset.map(_attach_text_label, batched=True, num_proc=1)

Map: 100%|██████████| 2037/2037 [00:00<00:00, 59021.65 examples/s]
Map: 100%|██████████| 227/227 [00:00<00:00, 19420.85 examples/s]


In [10]:
# Peek at one training example to confirm the `text_label` column was added.
dataset["train"][0]

{'sentence': 'Basware finances the acquisition with a bank loan .',
 'label': 1,
 'text_label': 'neutral'}

In [63]:
# Data preprocessing settings read by `preprocess_function` below.
text_column = "sentence"  # column holding the input text
label_column = "text_label"  # column holding the class name used as the target text
max_length = 128  # max_length passed to the tokenizer for the input sentences


def preprocess_function(examples):
    """Tokenize a batch of sentences and their text labels.

    Args:
        examples: A batch mapping column names to lists; the `text_column` and
            `label_column` entries are read.

    Returns:
        The tokenizer output for the sentences (padded/truncated to
        `max_length`) with an extra "labels" entry holding the token ids of
        the label text (padded/truncated to 5 tokens).
    """

    def _encode(texts, limit):
        # Fixed-length encoding shared by inputs and targets.
        return tokenizer(
            texts,
            max_length=limit,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    model_inputs = _encode(examples[text_column], max_length)
    label_ids = _encode(examples[label_column], 5)["input_ids"]
    # NOTE(review): padded label positions keep the pad token id; they are not
    # replaced with -100 here, so presumably they count towards the training
    # loss - confirm this is intended.
    model_inputs["labels"] = label_ids
    return model_inputs


# Tokenize every split, dropping the raw columns so that only the tokenizer
# outputs and "labels" remain. The cache is bypassed so edits to
# `preprocess_function` always take effect.
_raw_columns = dataset["train"].column_names
processed_datasets = dataset.map(
    preprocess_function,
    desc="Running tokenizer on dataset",
    remove_columns=_raw_columns,
    load_from_cache_file=False,
    batched=True,
    num_proc=1,
)

train_dataset, eval_dataset = (
    processed_datasets["train"],
    processed_datasets["validation"],
)

Running tokenizer on dataset: 100%|██████████| 2037/2037 [00:00<00:00, 20860.61 examples/s]
Running tokenizer on dataset: 100%|██████████| 227/227 [00:00<00:00, 11093.20 examples/s]


In [43]:
# Show the first tokenized training example (input_ids, attention_mask, labels).
print(train_dataset[0])

{'input_ids': [6653, 3404, 14272, 8, 6566, 28, 3, 9, 2137, 2289, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [7163, 1, 0]}


In [81]:
# NOTE(review): this stores an extra attribute on the model config; no other
# cell in this notebook reads it explicitly - confirm whether it is needed.
model.config.label_pad_token_id = -100

In [82]:
# @title Step 11: Defining a Data Collator
from transformers import DataCollatorForSeq2Seq

# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, label_pad_token_id=-100, pad_to_multiple_of=8)
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
)

# Collate the first three training examples and inspect the resulting labels.
_seq2seq_samples = tuple(train_dataset[idx] for idx in range(3))
seq2seq_test_data = data_collator(_seq2seq_samples)
seq2seq_test_data["labels"]

tensor([[7163,    1,    0,    0,    0, -100, -100, -100],
        [1465,    1,    0,    0,    0, -100, -100, -100],
        [7163,    1,    0,    0,    0, -100, -100, -100]])

In [65]:
from transformers import DataCollatorWithPadding

# Collate the same three examples with the plain padding collator so its
# labels can be compared with the seq2seq collator's output.
default_collator = DataCollatorWithPadding(tokenizer=tokenizer)
_default_samples = tuple(train_dataset[idx] for idx in range(3))
default_test_data = default_collator(_default_samples)
default_test_data["labels"]

tensor([[7163,    1,    0,    0,    0],
        [1465,    1,    0,    0,    0],
        [7163,    1,    0,    0,    0]])

In [29]:
# Show the keys returned by each collator side by side.
seq2seq_test_data.keys(), default_test_data.keys()

(dict_keys(['input_ids', 'attention_mask', 'labels']),
 dict_keys(['input_ids', 'attention_mask', 'labels']))

In [36]:
# Compare the two collators' outputs key by key.
# `torch.equal` returns False when the tensors differ in shape or content.
# Elementwise `==` would instead raise on mismatched shapes, e.g. when only one
# collator pads "labels" up to a multiple of 8.
for k in seq2seq_test_data.keys():
    print(k, torch.equal(seq2seq_test_data[k], default_test_data[k]))

input_ids tensor(True)
attention_mask tensor(True)
labels tensor(True)


In [27]:
# Shape of the collated input ids: (number of examples, padded sequence length).
default_test_data["input_ids"].shape

torch.Size([3, 128])

In [34]:
# Scratch arithmetic - presumably sequence length (128) times the 3 collated examples.
128 * 3

384

In [84]:
import evaluate

# Load the ROUGE evaluation metric.
# metric = load_metric("rouge")
metric = evaluate.load("rouge")

In [85]:
# Sanity-check the ROUGE metric on a toy prediction/reference pair.
_toy_predictions = ["hello world"]
_toy_references = ["hello new world"]
result = metric.compute(
    predictions=_toy_predictions,
    references=_toy_references,
    use_stemmer=True,  # stem words so inflected forms still match
)
result

{'rouge1': 0.8, 'rouge2': 0.0, 'rougeL': 0.8, 'rougeLsum': 0.8}

In [None]:
# def infer(eval_pred):
#     predictions, labels = eval_pred
#     with torch.no_grad():
#         # TODO: why is the first value of predictions a pad token?
#         predictions = model_cpu.generate(predictions) # TODO: why is no attention mask needed here?
#         predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#         labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#         labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
#     return predictions, labels

In [86]:
def compute_metrics(eval_pred):
    """Evaluation function for the generative (text-label) task.

    Args:
        eval_pred: A `(predictions, labels)` pair. `predictions` is either a
            tuple whose first element holds the logits or the logits array
            itself; `labels` holds the target token ids, where -100 marks
            positions to ignore.

    Returns:
        A dict with the ROUGE scores from `metric` plus "acc" (exact-match
        rate between decoded predictions and decoded labels), all rounded to
        four decimals.
    """
    predictions, labels = eval_pred

    # Accept both a tuple of model outputs (logits first) and a bare logits
    # array; indexing a bare array with [0] would silently pick one example.
    logits = predictions[0] if isinstance(predictions, (tuple, list)) else predictions
    pred_ids = np.asarray(logits).argmax(-1)
    labels = np.asarray(labels)

    # Replace -100 with the pad token id so the labels decode correctly, and
    # blank out the predictions at the same positions: the model is never
    # supervised there, so whatever it emits must not count as a mismatch.
    pad_id = tokenizer.pad_token_id
    ignored = labels == -100
    if pred_ids.shape == labels.shape:
        pred_ids = np.where(ignored, pad_id, pred_ids)
    labels = np.where(ignored, pad_id, labels)

    # Decode predictions and references, then strip surrounding whitespace.
    decoded_preds = [
        text.strip()
        for text in tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    # Exact-match accuracy; guard against an empty evaluation set.
    matches = sum(pred == label for pred, label in zip(decoded_preds, decoded_labels))
    acc = matches / len(decoded_labels) if decoded_labels else 0.0

    # ROUGE scores (with stemming so inflected forms still match).
    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )
    result.update({"acc": acc})

    return {k: round(v, 4) for k, v in result.items()}

In [91]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="output",
    # --- optimisation ---
    learning_rate=1e-3,
    num_train_epochs=3,
    gradient_accumulation_steps=1,
    # auto_find_batch_size=True,  # pick the batch size automatically (to study)
    per_device_train_batch_size=64,
    per_device_eval_batch_size=32,
    # --- evaluate and checkpoint once per epoch, keeping at most 3 checkpoints ---
    # NOTE(review): newer transformers releases may expect `eval_strategy`
    # instead of `evaluation_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # --- log the training loss every 10 steps ---
    logging_strategy="steps",
    logging_steps=10,
    label_names=["labels"],
)



In [92]:
# Switch the model to training mode, then wire the model, the tokenized
# splits and the metric function into a Trainer.
# NOTE(review): the `data_collator` built earlier in the notebook is not
# passed here - confirm that relying on the Trainer's own collation is intended.
model.train()
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

In [93]:
# Run fine-tuning with the settings in `training_args`.
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Acc
1,0.6225,0.089998,0.8678,0.0,0.8678,0.8678,0.8678
2,0.0589,0.040104,0.9339,0.0,0.9339,0.9339,0.9339
3,0.0495,0.029371,0.9515,0.0,0.9515,0.9515,0.9515


TrainOutput(global_step=96, training_loss=1.2615819776741166, metrics={'train_runtime': 34.2866, 'train_samples_per_second': 178.233, 'train_steps_per_second': 2.8, 'total_flos': 938639988228096.0, 'train_loss': 1.2615819776741166, 'epoch': 3.0})