<a href="https://colab.research.google.com/github/LIU2016/AI-Practice-Tensorflow-Notes/blob/feature-0318/colab_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1、加载配置及模型
!pip install transformers datasets peft accelerate -q
安装 transformers、datasets、peft 和 accelerate 这四个 Python 库。

-q 选项使得安装过程的输出信息简洁一些。

In [8]:
!pip install transformers datasets peft accelerate -q
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import json

# 加载 tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 添加LoRA适配器
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_lin", "v_lin"],
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # 显示可训练参数（通常<1%）

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 147,456 || all params: 67,102,466 || trainable%: 0.2197


2、处理数据

In [9]:
# 1. 从 JSON 文件中读取数据
with open("/content/data/data-json1.json", "r", encoding="utf-8") as file:
    data = json.load(file)
dataset = Dataset.from_dict({"text": [item["text"] for item in data], "label": [item["label"] for item in data]})
print(dataset)

# 分词函数
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# 分词数据
tokenized_datasets = dataset.map(tokenize_function, batched=True)
print("分词后的数据示例：", tokenized_datasets[:2])  # 打印前 2 条分词后数据

tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.1)
print("训练集示例：", tokenized_datasets["train"][:2])  # 训练集前 2 条
print("测试集示例：", tokenized_datasets["test"][:2])  # 测试集前 2 条


Dataset({
    features: ['text', 'label'],
    num_rows: 5
})


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

分词后的数据示例： {'text': ['I love this movie!', 'This is terrible.'], 'label': [1, 0], 'input_ids': [[101, 1045, 2293, 2023, 3185, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

3、训练

In [10]:
# 训练配置
from transformers import TrainingArguments, Trainer

# training_args = TrainingArguments(
#     output_dir="./results",
#     per_device_train_batch_size=8,  # T4 GPU可承受的batch大小
#     num_train_epochs=1,
#     fp16=True,  # 启用混合精度
#     gradient_accumulation_steps=2,  # 累积梯度
# )

training_args = TrainingArguments(
    output_dir="./colab-02-results",         # 输出目录
    evaluation_strategy="epoch",    # 每个 epoch 进行一次评估
    learning_rate=2e-5,             # 学习率
    per_device_train_batch_size=8,  # 每个设备的训练批大小
    per_device_eval_batch_size=8,   # 每个设备的评估批大小
    num_train_epochs=3,             # 训练 epoch 数
    weight_decay=0.01,              # 权重衰减
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mliu69577500912040031[0m ([33mliu69577500912040031-tb[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


TrainOutput(global_step=1, training_loss=0.3498755097389221, metrics={'train_runtime': 70.7412, 'train_samples_per_second': 0.057, 'train_steps_per_second': 0.014, 'total_flos': 531681533952.0, 'train_loss': 0.3498755097389221, 'epoch': 1.0})

4、评估模型

In [11]:
# 评估模型
eval_results = trainer.evaluate()
print(f"评估结果: {eval_results}")

评估结果: {'eval_runtime': 2.9685, 'eval_samples_per_second': 0.337, 'eval_steps_per_second': 0.337, 'epoch': 1.0}


5、保存模型

In [12]:
# 保存模型
model.save_pretrained("./distilbert_model")
tokenizer.save_pretrained("./distilbert_model")

('./distilbert_model/tokenizer_config.json',
 './distilbert_model/special_tokens_map.json',
 './distilbert_model/vocab.txt',
 './distilbert_model/added_tokens.json',
 './distilbert_model/tokenizer.json')

6、使用模型

In [53]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

# 加载保存的模型和 tokenizer
model_path = "./distilbert_model"
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizer.from_pretrained(model_path)

# 检查标签映射（关键！）
print("模型标签映射:", model.config.id2label)  # 假设输出 {0: 'NEGATIVE', 1: 'POSITIVE'}

# 设置设备和评估模式
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# 需要预测的新文本数据
new_texts = ["This food is so nice"]

# 对新文本进行分词并移至设备
inputs = tokenizer(new_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
inputs = {k: v.to(device) for k, v in inputs.items()}

# 推理
with torch.no_grad():
    outputs = model(**inputs)

# 获取预测结果
predictions = outputs.logits.argmax(dim=-1).cpu().numpy()
print(f"获取预测结果：{predictions}")

# 根据模型配置的标签映射输出结果（动态适配）
labels = [model.config.id2label[pred] for pred in predictions]
print(f"预测结果: {labels}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


模型标签映射: {0: 'LABEL_0', 1: 'LABEL_1'}
获取预测结果：[1]
预测结果: ['LABEL_1']
